## import os
import math
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import metrics
from scipy import stats as st
from scipy.stats import linregress
import scipy.optimize as optimize
import scipy.sparse as sparse
import scipy.special as special
import scipy.stats
from sklearn.model_selection import train_test_split
from sklearn import datasets, linear_model
import warnings
warnings.filterwarnings('ignore') #suppress warning messages (those with peach background)
import statsmodels.api as sm
from statsmodels.formula.api import ols
# Load the raw Covid-19 UK dataset; pandas assigns its default integer index.
df = pd.read_csv('Covid-19-UK.csv')
print('Imported Dataset from Covid-19-UK.csv:')
df  # last expression of the cell: renders the loaded frame
Imported Dataset from Covid-19-UK.csv:
| date | cumCasesByPublishDate | cumVirusTests | cumDailyNsoDeathsByDeathDate | cumPeopleVaccinatedCompleteByPublishDate | cumAdmissions | FTSE100 Open Price | Close Price | High Price | Low Price | Volume | temperature max degC | tmin degC | rain mm | sun hours | U.S. cases | U.S. deaths | https://coronavirus.data.gov.uk/details/download | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2020-01-21 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | 0.0 | https://sharecast.com/index/FTSE_100/prices/do... |
| 1 | 2020-01-22 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | 0.0 | https://www.metoffice.gov.uk/pub/data/weather/... |
| 2 | 2020-01-23 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | 0.0 | https://github.com/nytimes/covid-19-data/blob/... |
| 3 | 2020-01-24 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2.0 | 0.0 | NaN |
| 4 | 2020-01-25 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 3.0 | 0.0 | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 434 | 2021-03-30 | 4341736.0 | 122824208.0 | NaN | 4125884.0 | NaN | 6736.17 | 6772.12 | 6792.23 | 6729.46 | 654363968.0 | NaN | NaN | NaN | NaN | 30416970.0 | 550500.0 | NaN |
| 435 | 2021-03-31 | 4345788.0 | 124147198.0 | NaN | 4513458.0 | NaN | 6772.12 | 6713.63 | 6775.67 | 6713.63 | 837940608.0 | 12.1 | 2.8 | 37.0 | 116.3 | 30485232.0 | 551638.0 | NaN |
| 436 | 2021-04-01 | 4350266.0 | NaN | NaN | 4958874.0 | NaN | 6713.63 | 6737.30 | 6766.52 | 6713.63 | 588526528.0 | NaN | NaN | NaN | NaN | 30562856.0 | 552593.0 | NaN |
| 437 | 2021-04-02 | 4353668.0 | NaN | NaN | 5205505.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 30631700.0 | 553554.0 | NaN |
| 438 | 2021-04-03 | 4357091.0 | NaN | NaN | 5381745.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
439 rows × 18 columns
# The trailing column only carries source hyperlinks, so discard it.
df = df.drop(columns=df.columns[-1])
# df1 keeps an independent snapshot of the trimmed frame.
df1 = df.copy()
df
| date | cumCasesByPublishDate | cumVirusTests | cumDailyNsoDeathsByDeathDate | cumPeopleVaccinatedCompleteByPublishDate | cumAdmissions | FTSE100 Open Price | Close Price | High Price | Low Price | Volume | temperature max degC | tmin degC | rain mm | sun hours | U.S. cases | U.S. deaths | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2020-01-21 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | 0.0 |
| 1 | 2020-01-22 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | 0.0 |
| 2 | 2020-01-23 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | 0.0 |
| 3 | 2020-01-24 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2.0 | 0.0 |
| 4 | 2020-01-25 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 3.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 434 | 2021-03-30 | 4341736.0 | 122824208.0 | NaN | 4125884.0 | NaN | 6736.17 | 6772.12 | 6792.23 | 6729.46 | 654363968.0 | NaN | NaN | NaN | NaN | 30416970.0 | 550500.0 |
| 435 | 2021-03-31 | 4345788.0 | 124147198.0 | NaN | 4513458.0 | NaN | 6772.12 | 6713.63 | 6775.67 | 6713.63 | 837940608.0 | 12.1 | 2.8 | 37.0 | 116.3 | 30485232.0 | 551638.0 |
| 436 | 2021-04-01 | 4350266.0 | NaN | NaN | 4958874.0 | NaN | 6713.63 | 6737.30 | 6766.52 | 6713.63 | 588526528.0 | NaN | NaN | NaN | NaN | 30562856.0 | 552593.0 |
| 437 | 2021-04-02 | 4353668.0 | NaN | NaN | 5205505.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 30631700.0 | 553554.0 |
| 438 | 2021-04-03 | 4357091.0 | NaN | NaN | 5381745.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
439 rows × 17 columns
# Load the SPY ETF price history for the same time range.
# Only three columns are kept (date, open, close), in line with the column
# limit set by the professor; "Date" is lower-cased to match df's key column.
spy = (
    pd.read_csv("SPY.csv")[["Date", "Open", "Close"]]
    .rename(columns={"Date": "date"})
)
spy.head(20)
| date | Open | Close | |
|---|---|---|---|
| 0 | 2020-01-21 | 330.899994 | 331.299988 |
| 1 | 2020-01-22 | 332.239990 | 331.339996 |
| 2 | 2020-01-23 | 330.630005 | 331.720001 |
| 3 | 2020-01-24 | 332.440002 | 328.769989 |
| 4 | 2020-01-27 | 323.029999 | 323.500000 |
| 5 | 2020-01-28 | 325.059998 | 326.890015 |
| 6 | 2020-01-29 | 328.380005 | 326.619995 |
| 7 | 2020-01-30 | 324.359985 | 327.679993 |
| 8 | 2020-01-31 | 327.000000 | 321.730011 |
| 9 | 2020-02-03 | 323.350006 | 324.119995 |
| 10 | 2020-02-04 | 328.070007 | 329.059998 |
| 11 | 2020-02-05 | 332.269989 | 332.859985 |
| 12 | 2020-02-06 | 333.910004 | 333.980011 |
| 13 | 2020-02-07 | 332.820007 | 332.200012 |
| 14 | 2020-02-10 | 331.230011 | 334.679993 |
| 15 | 2020-02-11 | 336.160004 | 335.260010 |
| 16 | 2020-02-12 | 336.829987 | 337.420013 |
| 17 | 2020-02-13 | 335.859985 | 337.059998 |
| 18 | 2020-02-14 | 337.510010 | 337.600006 |
| 19 | 2020-02-18 | 336.510010 | 336.730011 |
# Join df and spy on date.
# Both key columns are first cast to the same datetime type so they compare equal.
for frame in (spy, df):
    frame["date"] = frame["date"].astype('datetime64[ns]')
# Left join: every row of df is kept; dates absent from spy get NaN Open/Close.
df = df.merge(spy, how="left", on="date")
# Show the merged frame
df
| date | cumCasesByPublishDate | cumVirusTests | cumDailyNsoDeathsByDeathDate | cumPeopleVaccinatedCompleteByPublishDate | cumAdmissions | FTSE100 Open Price | Close Price | High Price | Low Price | Volume | temperature max degC | tmin degC | rain mm | sun hours | U.S. cases | U.S. deaths | Open | Close | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2020-01-21 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | 0.0 | 330.899994 | 331.299988 |
| 1 | 2020-01-22 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | 0.0 | 332.239990 | 331.339996 |
| 2 | 2020-01-23 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | 0.0 | 330.630005 | 331.720001 |
| 3 | 2020-01-24 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2.0 | 0.0 | 332.440002 | 328.769989 |
| 4 | 2020-01-25 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 3.0 | 0.0 | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 434 | 2021-03-30 | 4341736.0 | 122824208.0 | NaN | 4125884.0 | NaN | 6736.17 | 6772.12 | 6792.23 | 6729.46 | 654363968.0 | NaN | NaN | NaN | NaN | 30416970.0 | 550500.0 | 394.420013 | 394.730011 |
| 435 | 2021-03-31 | 4345788.0 | 124147198.0 | NaN | 4513458.0 | NaN | 6772.12 | 6713.63 | 6775.67 | 6713.63 | 837940608.0 | 12.1 | 2.8 | 37.0 | 116.3 | 30485232.0 | 551638.0 | 395.339996 | 396.329987 |
| 436 | 2021-04-01 | 4350266.0 | NaN | NaN | 4958874.0 | NaN | 6713.63 | 6737.30 | 6766.52 | 6713.63 | 588526528.0 | NaN | NaN | NaN | NaN | 30562856.0 | 552593.0 | 398.399994 | 400.609985 |
| 437 | 2021-04-02 | 4353668.0 | NaN | NaN | 5205505.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 30631700.0 | 553554.0 | NaN | NaN |
| 438 | 2021-04-03 | 4357091.0 | NaN | NaN | 5381745.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
439 rows × 19 columns
# Show the distribution of the features in the dataset:
# one histogram per plottable column, laid out on a single 16x16 figure.
df.hist(figsize=(16,16))
array([[<AxesSubplot:title={'center':'date'}>,
<AxesSubplot:title={'center':'cumCasesByPublishDate'}>,
<AxesSubplot:title={'center':'cumVirusTests'}>,
<AxesSubplot:title={'center':'cumDailyNsoDeathsByDeathDate'}>],
[<AxesSubplot:title={'center':'cumPeopleVaccinatedCompleteByPublishDate'}>,
<AxesSubplot:title={'center':'cumAdmissions'}>,
<AxesSubplot:title={'center':'FTSE100 Open Price'}>,
<AxesSubplot:title={'center':'Close Price'}>],
[<AxesSubplot:title={'center':'High Price'}>,
<AxesSubplot:title={'center':'Low Price'}>,
<AxesSubplot:title={'center':'Volume'}>,
<AxesSubplot:title={'center':'temperature max degC'}>],
[<AxesSubplot:title={'center':'tmin degC'}>,
<AxesSubplot:title={'center':'rain mm'}>,
<AxesSubplot:title={'center':'sun hours'}>,
<AxesSubplot:title={'center':'U.S. cases'}>],
[<AxesSubplot:title={'center':'U.S. deaths'}>,
<AxesSubplot:title={'center':'Open'}>,
<AxesSubplot:title={'center':'Close'}>, <AxesSubplot:>]],
dtype=object)
# Per-column tally of missing (NaN) entries in the merged frame.
print(" \nCount total missing values (NaN) at each column in cleaned dataFrame :")
df.isna().sum()
Count total missing values (NaN) at each column in cleaned dataFrame :
date 0 cumCasesByPublishDate 10 cumVirusTests 93 cumDailyNsoDeathsByDeathDate 24 cumPeopleVaccinatedCompleteByPublishDate 355 cumAdmissions 68 FTSE100 Open Price 142 Close Price 142 High Price 142 Low Price 142 Volume 142 temperature max degC 424 tmin degC 424 rain mm 424 sun hours 424 U.S. cases 1 U.S. deaths 1 Open 136 Close 136 dtype: int64
# Report how many distinct values each column holds, so that columns carrying
# a single constant value can be spotted. Nothing is deleted in this cell.
print(" \nsummarize the number of unique values in each column of dataframe:")
unique_counts = df.nunique()
print(unique_counts)
summarize the number of unique values in each column of dataframe: date 439 cumCasesByPublishDate 407 cumVirusTests 346 cumDailyNsoDeathsByDeathDate 384 cumPeopleVaccinatedCompleteByPublishDate 84 cumAdmissions 371 FTSE100 Open Price 296 Close Price 297 High Price 294 Low Price 292 Volume 297 temperature max degC 14 tmin degC 15 rain mm 15 sun hours 15 U.S. cases 418 U.S. deaths 399 Open 299 Close 298 dtype: int64
# Boolean mask marking rows that repeat an earlier row in full.
dups = df.duplicated()
# Report whether any duplicate exists at all.
has_duplicates = dups.any()
print("Are there any duplicates: ", has_duplicates)
# List the duplicated rows themselves (empty frame when there are none).
duplicate_rows = df.loc[dups]
print("\nList all duplicate rows: \n", duplicate_rows)
Are there any duplicates: False List all duplicate rows: Empty DataFrame Columns: [date, cumCasesByPublishDate, cumVirusTests, cumDailyNsoDeathsByDeathDate, cumPeopleVaccinatedCompleteByPublishDate, cumAdmissions, FTSE100 Open Price, Close Price, High Price, Low Price, Volume, temperature max degC, tmin degC, rain mm, sun hours, U.S. cases, U.S. deaths, Open, Close] Index: []
# Record the shape, remove fully duplicated rows in place, then report again.
shape_before = df.shape
print('Dataframe shape before drop duplicates: ', shape_before)
df.drop_duplicates(inplace=True)
shape_after = df.shape
print('Dataframe shape after drop duplicates: ', shape_after)
Dataframe shape before drop duplicates: (439, 19) Dataframe shape after drop duplicates: (439, 19)
# Per-column tally of missing (NaN) entries after the duplicate check.
print(" \nCount total missing values (NaN) at each column in cleaned dataFrame :")
df.isna().sum()
Count total missing values (NaN) at each column in cleaned dataFrame :
date 0 cumCasesByPublishDate 10 cumVirusTests 93 cumDailyNsoDeathsByDeathDate 24 cumPeopleVaccinatedCompleteByPublishDate 355 cumAdmissions 68 FTSE100 Open Price 142 Close Price 142 High Price 142 Low Price 142 Volume 142 temperature max degC 424 tmin degC 424 rain mm 424 sun hours 424 U.S. cases 1 U.S. deaths 1 Open 136 Close 136 dtype: int64
# Replace NaN in the weather columns with that column's mean.
# The result is assigned back to df[col] instead of calling
# df[col].fillna(..., inplace=True): the latter is chained assignment, which
# warns in pandas 2.x and leaves df unchanged under Copy-on-Write.
weather_cols = ['temperature max degC', 'tmin degC', 'rain mm', 'sun hours']
for weather_col in weather_cols:
    # int() truncates the mean to a whole number, as in the original notebook.
    df[weather_col] = df[weather_col].fillna(int(df[weather_col].mean()))
df.isnull().sum()
date 0 cumCasesByPublishDate 10 cumVirusTests 93 cumDailyNsoDeathsByDeathDate 24 cumPeopleVaccinatedCompleteByPublishDate 355 cumAdmissions 68 FTSE100 Open Price 142 Close Price 142 High Price 142 Low Price 142 Volume 142 temperature max degC 0 tmin degC 0 rain mm 0 sun hours 0 U.S. cases 1 U.S. deaths 1 Open 136 Close 136 dtype: int64
# Take the value from the previous row for the FTSE and SPY market columns.
# .ffill() replaces the deprecated fillna(method='ffill'), and the result is
# assigned back rather than filled "inplace" on a column selection, which is
# chained assignment and does not update df under Copy-on-Write.
market_cols = ['FTSE100 Open Price', 'Close Price', 'High Price', 'Low Price',
               'Volume', "Open", "Close"]
df[market_cols] = df[market_cols].ffill()
# 'U.S. cases' / 'U.S. deaths' (one missing value each, on the last day) are
# deliberately left untouched here.
# Leading NaNs cannot be forward-filled; they are still counted below.
df.isnull().sum()
date 0 cumCasesByPublishDate 10 cumVirusTests 93 cumDailyNsoDeathsByDeathDate 24 cumPeopleVaccinatedCompleteByPublishDate 355 cumAdmissions 68 FTSE100 Open Price 9 Close Price 9 High Price 9 Low Price 9 Volume 9 temperature max degC 0 tmin degC 0 rain mm 0 sun hours 0 U.S. cases 1 U.S. deaths 1 Open 0 Close 0 dtype: int64
# The FTSE data has missing values at the very start, which a forward fill
# cannot reach (there is no earlier row to copy from). Back-fill instead, so
# the first available quote is propagated upwards.
print("First few rows of FTSE data\n", df[["FTSE100 Open Price", "Close Price","High Price","Low Price"]].head(15))
# .bfill() replaces the deprecated fillna(method='bfill'); assigning the result
# back avoids chained in-place assignment, which does not update df under
# Copy-on-Write.
market_cols = ['FTSE100 Open Price', 'Close Price', 'High Price', 'Low Price',
               'Volume', "Open", "Close"]
df[market_cols] = df[market_cols].bfill()
df.isnull().sum()
First few rows of FTSE data
FTSE100 Open Price Close Price High Price Low Price
0 NaN NaN NaN NaN
1 NaN NaN NaN NaN
2 NaN NaN NaN NaN
3 NaN NaN NaN NaN
4 NaN NaN NaN NaN
5 NaN NaN NaN NaN
6 NaN NaN NaN NaN
7 NaN NaN NaN NaN
8 NaN NaN NaN NaN
9 7483.57 7381.96 7483.57 7357.62
10 7381.96 7286.01 7398.34 7275.03
11 7381.96 7286.01 7398.34 7275.03
12 7381.96 7286.01 7398.34 7275.03
13 7286.01 7326.31 7359.98 7285.16
14 7326.31 7439.82 7442.08 7326.31
date 0 cumCasesByPublishDate 10 cumVirusTests 93 cumDailyNsoDeathsByDeathDate 24 cumPeopleVaccinatedCompleteByPublishDate 355 cumAdmissions 68 FTSE100 Open Price 0 Close Price 0 High Price 0 Low Price 0 Volume 0 temperature max degC 0 tmin degC 0 rain mm 0 sun hours 0 U.S. cases 1 U.S. deaths 1 Open 0 Close 0 dtype: int64
# Seed the cumulative UK series with zeros; `limit` caps how many NaNs
# (counted from the top of the column) are replaced.
#   cumCasesByPublishDate: the UK outbreak started on 31 Jan 2020, so the
#       first 10 missing values mean 0 cases.
#   cumDailyNsoDeathsByDeathDate: only the first 9 missing values are set
#       to 0 (row 9 already carries a reported value).
#   remaining series: only the first missing value is zeroed, presumably to
#       anchor the later interpolation at 0 -- TODO confirm.
zero_fill_limits = {
    'cumCasesByPublishDate': 10,
    'cumDailyNsoDeathsByDeathDate': 9,
    "cumVirusTests": 1,
    "cumPeopleVaccinatedCompleteByPublishDate": 1,
    "cumAdmissions": 1,
}
for column, max_fills in zero_fill_limits.items():
    df[column] = df[column].fillna(0, limit=max_fills)
df.isnull().sum()
date 0 cumCasesByPublishDate 0 cumVirusTests 92 cumDailyNsoDeathsByDeathDate 15 cumPeopleVaccinatedCompleteByPublishDate 354 cumAdmissions 67 FTSE100 Open Price 0 Close Price 0 High Price 0 Low Price 0 Volume 0 temperature max degC 0 tmin degC 0 rain mm 0 sun hours 0 U.S. cases 1 U.S. deaths 1 Open 0 Close 0 dtype: int64
# Preview the first ten rows to check the fills applied so far.
df.head(10)
| date | cumCasesByPublishDate | cumVirusTests | cumDailyNsoDeathsByDeathDate | cumPeopleVaccinatedCompleteByPublishDate | cumAdmissions | FTSE100 Open Price | Close Price | High Price | Low Price | Volume | temperature max degC | tmin degC | rain mm | sun hours | U.S. cases | U.S. deaths | Open | Close | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2020-01-21 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 7483.57 | 7381.96 | 7483.57 | 7357.62 | 672044352.0 | 14.0 | 6.0 | 76.0 | 135.0 | 1.0 | 0.0 | 330.899994 | 331.299988 |
| 1 | 2020-01-22 | 0.0 | NaN | 0.0 | NaN | NaN | 7483.57 | 7381.96 | 7483.57 | 7357.62 | 672044352.0 | 14.0 | 6.0 | 76.0 | 135.0 | 1.0 | 0.0 | 332.239990 | 331.339996 |
| 2 | 2020-01-23 | 0.0 | NaN | 0.0 | NaN | NaN | 7483.57 | 7381.96 | 7483.57 | 7357.62 | 672044352.0 | 14.0 | 6.0 | 76.0 | 135.0 | 1.0 | 0.0 | 330.630005 | 331.720001 |
| 3 | 2020-01-24 | 0.0 | NaN | 0.0 | NaN | NaN | 7483.57 | 7381.96 | 7483.57 | 7357.62 | 672044352.0 | 14.0 | 6.0 | 76.0 | 135.0 | 2.0 | 0.0 | 332.440002 | 328.769989 |
| 4 | 2020-01-25 | 0.0 | NaN | 0.0 | NaN | NaN | 7483.57 | 7381.96 | 7483.57 | 7357.62 | 672044352.0 | 14.0 | 6.0 | 76.0 | 135.0 | 3.0 | 0.0 | 332.440002 | 328.769989 |
| 5 | 2020-01-26 | 0.0 | NaN | 0.0 | NaN | NaN | 7483.57 | 7381.96 | 7483.57 | 7357.62 | 672044352.0 | 14.0 | 6.0 | 76.0 | 135.0 | 5.0 | 0.0 | 332.440002 | 328.769989 |
| 6 | 2020-01-27 | 0.0 | NaN | 0.0 | NaN | NaN | 7483.57 | 7381.96 | 7483.57 | 7357.62 | 672044352.0 | 14.0 | 6.0 | 76.0 | 135.0 | 5.0 | 0.0 | 323.029999 | 323.500000 |
| 7 | 2020-01-28 | 0.0 | NaN | 0.0 | NaN | NaN | 7483.57 | 7381.96 | 7483.57 | 7357.62 | 672044352.0 | 14.0 | 6.0 | 76.0 | 135.0 | 5.0 | 0.0 | 325.059998 | 326.890015 |
| 8 | 2020-01-29 | 0.0 | NaN | 0.0 | NaN | NaN | 7483.57 | 7381.96 | 7483.57 | 7357.62 | 672044352.0 | 14.0 | 6.0 | 76.0 | 135.0 | 5.0 | 0.0 | 328.380005 | 326.619995 |
| 9 | 2020-01-30 | 0.0 | NaN | 1.0 | NaN | NaN | 7483.57 | 7381.96 | 7483.57 | 7357.62 | 672044352.0 | 14.0 | 6.0 | 76.0 | 135.0 | 6.0 | 0.0 | 324.359985 | 327.679993 |
# Normalise the date column to plain strings on the working frame.
df['date'] = df['date'].astype('str')
# Work on a copy whose index is the parsed date; the 'date' column itself is
# removed from the copy so only the remaining columns are interpolated.
mdf = df.copy()
mdf.index = pd.to_datetime(mdf.pop('date'))
# A trading volume of 0 is treated as missing.
mdf["Volume"] = mdf["Volume"].replace(0, np.nan)
# Interpolate down the rows, filling gaps at both ends as well.
qqdf = mdf.interpolate(axis=0, limit_direction="both")
plt.plot(qqdf)
# Turn the date index back into a regular first column.
# df and qqdf refer to the same object afterwards.
qqdf.reset_index(inplace=True)
df = qqdf
# Automatic Outliers Detection
# https://scikit-learn.org/stable/auto_examples/miscellaneous/plot_anomaly_comparison.html
#
# Runs three detectors (IsolationForest, LocalOutlierFactor, EllipticEnvelope)
# on the numeric columns of df and reports the rows flagged by all three.
# Names left behind for later cells:
#   d      - the numeric frame the detectors were fitted on
#   r      - number of rows in d
#   f      - per-row verdict (-1 = flagged by all three detectors, else 1)
#   p      - number of rows flagged by all three
#   common - positions (within d) of those rows; always defined, [] if none
#   q      - up to 100 of those positions, worst LocalOutlierFactor score first
import warnings

from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

st0 = time.time()
stm = st0

# Must remove date as it is of a type that cannot be interpreted by SKLearn
d = df.drop("date", axis=1)
cn = list(d)
# delete any row where all x-variables are missing
d = d[cn].dropna(how='all')
# delete any column of constant values
d = d[d.columns[d.nunique() > 1]]
# keep only columns that are fully numeric and have no missing value
# (selection instead of `del` inside a loop over the frame being modified)
usable_columns = [
    column for column in d.columns
    if not pd.to_numeric(d[column], errors='coerce').isnull().any()
]
d = d[usable_columns]

r = len(d)
print('Preparing data of', r, 'rows &', d.shape[1], 'columns took',
      '%.2f' % ((time.time() - stm) / 60), 'mins.')
stm = time.time()

# Isolation Forest
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html
g = IsolationForest(n_jobs=-1).fit_predict(d).tolist()
print('\nNumber of IsolationForest outliers:', g.count(-1))
print('... took', '%.2f' % ((time.time() - stm) / 60), 'mins.')
stm = time.time()

# Local Outlier Factor
# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.LocalOutlierFactor.html
a0 = LocalOutlierFactor(n_jobs=-1)
a = a0.fit_predict(d).tolist()
ac = a.count(-1)  # keep for EllipticEnvelope
an = a0.negative_outlier_factor_  # use this size info to rank outliers later
print('\nNumber of LocalOutlierFactor outliers:', ac)
print('... took', '%.2f' % ((time.time() - stm) / 60), 'mins.')
stm = time.time()

# Elliptic Envelope for Gaussian distribution
# https://scikit-learn.org/stable/modules/generated/sklearn.covariance.EllipticEnvelope.html
# The contamination rate follows the LocalOutlierFactor outlier share, capped
# at 0.1. A rate of 0 is rejected by EllipticEnvelope, so when
# LocalOutlierFactor flags nothing fall back to 0.1; the intersection below is
# empty in that case regardless.
contamination = min(0.1, ac / r) if ac > 0 else 0.1
# -1 in 'e' identifies an outlier position:
e = EllipticEnvelope(contamination=contamination).fit_predict(d).tolist()
print('\nNumber of EllipticEnvelope outliers:', e.count(-1))
print('... took', '%.2f' % ((time.time() - stm) / 60), 'mins.')
stm = time.time()

# find shared outliers: a row counts only when all three detectors flag it
f = [-1 if s == -1 and t == -1 and u == -1 else 1 for s, t, u in zip(a, g, e)]
p = f.count(-1)
# Always (re)define these so a re-run never leaves stale values from an
# earlier execution for the next cell to pick up.
common = [position for position, verdict in enumerate(f) if verdict == -1]
q = []
if p > 0:
    # most negative LocalOutlierFactor score first; ties keep ascending position
    q = sorted(common, key=lambda position: an[position])[:100]
    print('\nNumber of outliers common to all lists:', p,
          '\nIndices of up to 100 sorted outliers with the worst first:', q,
          'from [0 to', str(r - 1) + '].')
    print('\nProportion of data outlying:', '%.5f' % (p / r))
else:
    print('\nNo outlier common to all lists.')
print('\nOutliers detection took', '%.2f' % ((time.time() - st0) / 60), 'mins.')
Preparing data of 439 rows & 18 columns took 0.00 mins. Number of IsolationForest outliers: 49 ... took 0.01 mins. Number of LocalOutlierFactor outliers: 21 ... took 0.00 mins. Number of EllipticEnvelope outliers: 21 ... took 0.00 mins. No outlier common to all lists. Outliers detection took 0.01 mins.
# Instead of dropping outlier rows, blank every column except the first
# ('date') with NaN so the gaps can be interpolated or dropped later.
#
# The previous form, df.loc[common, 1:], mixed row labels with a positional
# column slice; on string column labels that raises TypeError, which the bare
# `except: pass` swallowed, so nothing was ever blanked. `p` (the number of
# shared outliers) is always defined by the detection cell, whereas `common`
# only exists when p > 0, hence the guard.
if p > 0:
    # `common` holds positions within d; translate them to df's row labels.
    outlier_labels = d.index[common]
    df.loc[outlier_labels, df.columns[1:]] = np.nan
    # NOTE(review): nothing in this cell re-interpolates the blanked rows;
    # they are written to cleaned_data.csv as NaN.
df
| date | cumCasesByPublishDate | cumVirusTests | cumDailyNsoDeathsByDeathDate | cumPeopleVaccinatedCompleteByPublishDate | cumAdmissions | FTSE100 Open Price | Close Price | High Price | Low Price | Volume | temperature max degC | tmin degC | rain mm | sun hours | U.S. cases | U.S. deaths | Open | Close | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2020-01-21 | 0.0 | 0.000000e+00 | 0.0 | 0.000000e+00 | 0.000000 | 7483.57 | 7381.96 | 7483.57 | 7357.62 | 672044352.0 | 14.0 | 6.0 | 76.0 | 135.0 | 1.0 | 0.0 | 330.899994 | 331.299988 |
| 1 | 2020-01-22 | 0.0 | 5.913656e+03 | 0.0 | 1.102532e+03 | 78.612903 | 7483.57 | 7381.96 | 7483.57 | 7357.62 | 672044352.0 | 14.0 | 6.0 | 76.0 | 135.0 | 1.0 | 0.0 | 332.239990 | 331.339996 |
| 2 | 2020-01-23 | 0.0 | 1.182731e+04 | 0.0 | 2.205065e+03 | 157.225806 | 7483.57 | 7381.96 | 7483.57 | 7357.62 | 672044352.0 | 14.0 | 6.0 | 76.0 | 135.0 | 1.0 | 0.0 | 330.630005 | 331.720001 |
| 3 | 2020-01-24 | 0.0 | 1.774097e+04 | 0.0 | 3.307597e+03 | 235.838710 | 7483.57 | 7381.96 | 7483.57 | 7357.62 | 672044352.0 | 14.0 | 6.0 | 76.0 | 135.0 | 2.0 | 0.0 | 332.440002 | 328.769989 |
| 4 | 2020-01-25 | 0.0 | 2.365462e+04 | 0.0 | 4.410130e+03 | 314.451613 | 7483.57 | 7381.96 | 7483.57 | 7357.62 | 672044352.0 | 14.0 | 6.0 | 76.0 | 135.0 | 3.0 | 0.0 | 332.440002 | 328.769989 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 434 | 2021-03-30 | 4341736.0 | 1.228242e+08 | 150116.0 | 4.125884e+06 | 457398.000000 | 6736.17 | 6772.12 | 6792.23 | 6729.46 | 654363968.0 | 14.0 | 6.0 | 76.0 | 135.0 | 30416970.0 | 550500.0 | 394.420013 | 394.730011 |
| 435 | 2021-03-31 | 4345788.0 | 1.241472e+08 | 150116.0 | 4.513458e+06 | 457398.000000 | 6772.12 | 6713.63 | 6775.67 | 6713.63 | 837940608.0 | 12.1 | 2.8 | 37.0 | 116.3 | 30485232.0 | 551638.0 | 395.339996 | 396.329987 |
| 436 | 2021-04-01 | 4350266.0 | 1.241472e+08 | 150116.0 | 4.958874e+06 | 457398.000000 | 6713.63 | 6737.30 | 6766.52 | 6713.63 | 588526528.0 | 14.0 | 6.0 | 76.0 | 135.0 | 30562856.0 | 552593.0 | 398.399994 | 400.609985 |
| 437 | 2021-04-02 | 4353668.0 | 1.241472e+08 | 150116.0 | 5.205505e+06 | 457398.000000 | 6713.63 | 6737.30 | 6766.52 | 6713.63 | 588526528.0 | 14.0 | 6.0 | 76.0 | 135.0 | 30631700.0 | 553554.0 | 398.399994 | 400.609985 |
| 438 | 2021-04-03 | 4357091.0 | 1.241472e+08 | 150116.0 | 5.381745e+06 | 457398.000000 | 6713.63 | 6737.30 | 6766.52 | 6713.63 | 588526528.0 | 14.0 | 6.0 | 76.0 | 135.0 | 30631700.0 | 553554.0 | 398.399994 | 400.609985 |
439 rows × 19 columns
# Store dates as plain strings before writing the file.
df['date'] = df['date'].astype(str)
# Map the raw source headers to more readable, prefixed names.
readable_names = {
    'FTSE100 Open Price': 'FTSE_Open_Price',
    'Close Price': 'FTSE_Close_Price',
    'High Price': 'FTSE_High_Price',
    'Low Price': 'FTSE_Low_Price',
    'Volume': 'FTSE_Volume',
    'temperature max degC': 'Max_Temperature_DegC',
    'tmin degC': 'Min_Temperature_DegC',
    'rain mm': 'Rainfall_mm',
    'sun hours': 'Sun_Hours',
    'U.S. cases': 'US_Covid_Cases',
    'U.S. deaths': 'US_Covid_Deaths',
    'Open': 'SPY_Open_Price',
    'Close': 'SPY_Close_Price',
}
# Relabel in place; columns without an entry in the map keep their name.
df.columns = [readable_names.get(column, column) for column in df.columns]
# Persist the cleaned frame (the row index is written as the first column).
df.to_csv('cleaned_data.csv')
df.shape
(439, 19)
# Display the cleaned, renamed frame.
df
| date | cumCasesByPublishDate | cumVirusTests | cumDailyNsoDeathsByDeathDate | cumPeopleVaccinatedCompleteByPublishDate | cumAdmissions | FTSE_Open_Price | FTSE_Close_Price | FTSE_High_Price | FTSE_Low_Price | FTSE_Volume | Max_Temperature_DegC | Min_Temperature_DegC | Rainfall_mm | Sun_Hours | US_Covid_Cases | US_Covid_Deaths | SPY_Open_Price | SPY_Close_Price | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2020-01-21 | 0.0 | 0.000000e+00 | 0.0 | 0.000000e+00 | 0.000000 | 7483.57 | 7381.96 | 7483.57 | 7357.62 | 672044352.0 | 14.0 | 6.0 | 76.0 | 135.0 | 1.0 | 0.0 | 330.899994 | 331.299988 |
| 1 | 2020-01-22 | 0.0 | 5.913656e+03 | 0.0 | 1.102532e+03 | 78.612903 | 7483.57 | 7381.96 | 7483.57 | 7357.62 | 672044352.0 | 14.0 | 6.0 | 76.0 | 135.0 | 1.0 | 0.0 | 332.239990 | 331.339996 |
| 2 | 2020-01-23 | 0.0 | 1.182731e+04 | 0.0 | 2.205065e+03 | 157.225806 | 7483.57 | 7381.96 | 7483.57 | 7357.62 | 672044352.0 | 14.0 | 6.0 | 76.0 | 135.0 | 1.0 | 0.0 | 330.630005 | 331.720001 |
| 3 | 2020-01-24 | 0.0 | 1.774097e+04 | 0.0 | 3.307597e+03 | 235.838710 | 7483.57 | 7381.96 | 7483.57 | 7357.62 | 672044352.0 | 14.0 | 6.0 | 76.0 | 135.0 | 2.0 | 0.0 | 332.440002 | 328.769989 |
| 4 | 2020-01-25 | 0.0 | 2.365462e+04 | 0.0 | 4.410130e+03 | 314.451613 | 7483.57 | 7381.96 | 7483.57 | 7357.62 | 672044352.0 | 14.0 | 6.0 | 76.0 | 135.0 | 3.0 | 0.0 | 332.440002 | 328.769989 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 434 | 2021-03-30 | 4341736.0 | 1.228242e+08 | 150116.0 | 4.125884e+06 | 457398.000000 | 6736.17 | 6772.12 | 6792.23 | 6729.46 | 654363968.0 | 14.0 | 6.0 | 76.0 | 135.0 | 30416970.0 | 550500.0 | 394.420013 | 394.730011 |
| 435 | 2021-03-31 | 4345788.0 | 1.241472e+08 | 150116.0 | 4.513458e+06 | 457398.000000 | 6772.12 | 6713.63 | 6775.67 | 6713.63 | 837940608.0 | 12.1 | 2.8 | 37.0 | 116.3 | 30485232.0 | 551638.0 | 395.339996 | 396.329987 |
| 436 | 2021-04-01 | 4350266.0 | 1.241472e+08 | 150116.0 | 4.958874e+06 | 457398.000000 | 6713.63 | 6737.30 | 6766.52 | 6713.63 | 588526528.0 | 14.0 | 6.0 | 76.0 | 135.0 | 30562856.0 | 552593.0 | 398.399994 | 400.609985 |
| 437 | 2021-04-02 | 4353668.0 | 1.241472e+08 | 150116.0 | 5.205505e+06 | 457398.000000 | 6713.63 | 6737.30 | 6766.52 | 6713.63 | 588526528.0 | 14.0 | 6.0 | 76.0 | 135.0 | 30631700.0 | 553554.0 | 398.399994 | 400.609985 |
| 438 | 2021-04-03 | 4357091.0 | 1.241472e+08 | 150116.0 | 5.381745e+06 | 457398.000000 | 6713.63 | 6737.30 | 6766.52 | 6713.63 | 588526528.0 | 14.0 | 6.0 | 76.0 | 135.0 | 30631700.0 | 553554.0 | 398.399994 | 400.609985 |
439 rows × 19 columns
# Reload the cleaned dataset from disk as a pandas dataframe.
# cleaned_data.csv was written with the row index as its first, unnamed
# column. Read that column back as the index (index_col=0); otherwise it
# comes back as a spurious 'Unnamed: 0' data column and is carried into the
# modelling frame as if it were a feature.
df_clean = pd.read_csv('cleaned_data.csv', index_col=0)
print('Imported Dataset from cleaned_data.csv:')
df_clean #show dataframe
Imported Dataset from cleaned_data.csv:
| Unnamed: 0 | date | cumCasesByPublishDate | cumVirusTests | cumDailyNsoDeathsByDeathDate | cumPeopleVaccinatedCompleteByPublishDate | cumAdmissions | FTSE_Open_Price | FTSE_Close_Price | FTSE_High_Price | FTSE_Low_Price | FTSE_Volume | Max_Temperature_DegC | Min_Temperature_DegC | Rainfall_mm | Sun_Hours | US_Covid_Cases | US_Covid_Deaths | SPY_Open_Price | SPY_Close_Price | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 2020-01-21 | 0.0 | 0.000000e+00 | 0.0 | 0.000000e+00 | 0.000000 | 7483.57 | 7381.96 | 7483.57 | 7357.62 | 672044352.0 | 14.0 | 6.0 | 76.0 | 135.0 | 1.0 | 0.0 | 330.899994 | 331.299988 |
| 1 | 1 | 2020-01-22 | 0.0 | 5.913656e+03 | 0.0 | 1.102532e+03 | 78.612903 | 7483.57 | 7381.96 | 7483.57 | 7357.62 | 672044352.0 | 14.0 | 6.0 | 76.0 | 135.0 | 1.0 | 0.0 | 332.239990 | 331.339996 |
| 2 | 2 | 2020-01-23 | 0.0 | 1.182731e+04 | 0.0 | 2.205065e+03 | 157.225806 | 7483.57 | 7381.96 | 7483.57 | 7357.62 | 672044352.0 | 14.0 | 6.0 | 76.0 | 135.0 | 1.0 | 0.0 | 330.630005 | 331.720001 |
| 3 | 3 | 2020-01-24 | 0.0 | 1.774097e+04 | 0.0 | 3.307597e+03 | 235.838710 | 7483.57 | 7381.96 | 7483.57 | 7357.62 | 672044352.0 | 14.0 | 6.0 | 76.0 | 135.0 | 2.0 | 0.0 | 332.440002 | 328.769989 |
| 4 | 4 | 2020-01-25 | 0.0 | 2.365462e+04 | 0.0 | 4.410130e+03 | 314.451613 | 7483.57 | 7381.96 | 7483.57 | 7357.62 | 672044352.0 | 14.0 | 6.0 | 76.0 | 135.0 | 3.0 | 0.0 | 332.440002 | 328.769989 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 434 | 434 | 2021-03-30 | 4341736.0 | 1.228242e+08 | 150116.0 | 4.125884e+06 | 457398.000000 | 6736.17 | 6772.12 | 6792.23 | 6729.46 | 654363968.0 | 14.0 | 6.0 | 76.0 | 135.0 | 30416970.0 | 550500.0 | 394.420013 | 394.730011 |
| 435 | 435 | 2021-03-31 | 4345788.0 | 1.241472e+08 | 150116.0 | 4.513458e+06 | 457398.000000 | 6772.12 | 6713.63 | 6775.67 | 6713.63 | 837940608.0 | 12.1 | 2.8 | 37.0 | 116.3 | 30485232.0 | 551638.0 | 395.339996 | 396.329987 |
| 436 | 436 | 2021-04-01 | 4350266.0 | 1.241472e+08 | 150116.0 | 4.958874e+06 | 457398.000000 | 6713.63 | 6737.30 | 6766.52 | 6713.63 | 588526528.0 | 14.0 | 6.0 | 76.0 | 135.0 | 30562856.0 | 552593.0 | 398.399994 | 400.609985 |
| 437 | 437 | 2021-04-02 | 4353668.0 | 1.241472e+08 | 150116.0 | 5.205505e+06 | 457398.000000 | 6713.63 | 6737.30 | 6766.52 | 6713.63 | 588526528.0 | 14.0 | 6.0 | 76.0 | 135.0 | 30631700.0 | 553554.0 | 398.399994 | 400.609985 |
| 438 | 438 | 2021-04-03 | 4357091.0 | 1.241472e+08 | 150116.0 | 5.381745e+06 | 457398.000000 | 6713.63 | 6737.30 | 6766.52 | 6713.63 | 588526528.0 | 14.0 | 6.0 | 76.0 | 135.0 | 30631700.0 | 553554.0 | 398.399994 | 400.609985 |
439 rows × 20 columns
## Generalized Linear Model (GLM) with transformed variables & interaction variables (collinearity issue considered)
# For comparing Normal & GLM error distributions.
# The algorithm is inefficient but clearer for instructional purposes.
# Favors fits with Xs' p-values < 0.05 and smaller rank-deficiency (number_of_Xs - Df_Model).
import warnings

import pandas as pd
import statsmodels.api as sm
# Import from the public IPython.display module; importing `display` from
# IPython.core.display is deprecated.
from IPython.display import HTML, display

warnings.filterwarnings('ignore')

# set maximum window width: stretch the notebook container to 100%
display(HTML("<style>.container { width:100% !important; }</style>"))
# NOTE(review): pandas documents 0 as "auto-detect the terminal width";
# confirm it gives the intended unlimited column display inside a notebook.
pd.options.display.max_columns = 0
#counter (not the loop variable i) selects the error distribution for each pass:
#it is turned into the string m2, which the family-selection code below compares with '2'..'9'
counter = 1
#Run 10 passes (i = 0..9), one candidate error distribution per pass
#NOTE(review): on the 10th pass m2 == '10', which matches none of '2'..'9' and so falls
#through to the Normal (OLS) branch again - confirm whether 9 passes were intended
#NOTE: the name i is reused further down in the loop body (transform loop, interaction count)
for i in range(0,10):
print("\n \n \n \n \n ")
print("========================================================== Next GLM Model ==============================================================")
print("\n \n \n \n \n ")
print("counter = ",counter)
m2 = str(counter)
counter += 1
# 1 out of the 2 following lines should be active:
# d = '' #means nanes96 will be used
#NOTE(review): 'nanes96' presumably refers to statsmodels' anes96 example dataset; the code
#that would load it is not part of this cell, so only the d = 1 path is usable as written
d = 1 #means use specified dataset instead of the default nanes96: d=''
if d == '':
pass
else:
#d = 1 case:
#Use cleaned_data
dt = df_clean.copy() #work on a copy so df_clean itself is not modified
#handle special variables
yname = 'FTSE_Close_Price' #target variable: FTSE close price
import regex as re
#raw strings are required here: '\W' in a plain literal is an invalid escape sequence
#(DeprecationWarning since Python 3.6, SyntaxWarning since 3.12)
#every non-word character (e.g. ' ') becomes '_' because the names are later joined into formula strings
dt.rename(columns=lambda x: re.sub(r'\W', '_', x), inplace=True)
yname = re.sub(r'\W', '_', yname)
#delete any row with any missing value
dt.dropna(how='any', inplace=True)
#move the target to the front: the code below treats column 0 as y and columns 1.. as Xs
if yname != dt.columns[0]:
    dt = dt[[yname] + list(dt.columns.drop(yname))]
#m2 = input('1: OLS (default), 2: Poisson, 3: Logit, 4: Probit, 5: Gamma, 6: InverseGaussian, 7: NegativeBinomial,' +
#' 8: Tweedie, 9: Gaussian? ').strip()
#https://www.statsmodels.org/stable/glm.html
#newer statsmodels releases name the probit link class Probit; older ones only have the lowercase probit
_links = sm.genmod.families.links
_probit_link = getattr(_links, 'Probit', None) or _links.probit
#m2 code -> (label, zero-argument factory); only the selected family is instantiated
glm_families = {
    '2': ('Poisson', sm.families.Poisson),
    '3': ('Logit', sm.families.Binomial),
    #http://web.pdx.edu/~crkl/ceR/Python/example8_1.py
    #the link is passed as an INSTANCE: statsmodels deprecated passing the link class itself
    #and newer releases reject it with a TypeError
    '4': ('Probit', lambda: sm.families.Binomial(link=_probit_link())),
    '5': ('Gamma', sm.families.Gamma),
    '6': ('InverseGaussian', sm.families.InverseGaussian),
    '7': ('NegativeBinomial', sm.families.NegativeBinomial),
    '8': ('Tweedie', sm.families.Tweedie),
    '9': ('GLM Gaussian', sm.families.Gaussian),
}
if m2 in glm_families:
    m, make_family = glm_families[m2]
    dist = make_family()
else:
    # m2 == 1 or '' (or any other code): plain OLS; dist is not used on this path and is left untouched
    m = 'Normal'
print('GLM Model Fitted = ',m)
#Build the working frame df and the target y for the selected model:
# - Normal (formula OLS): df keeps y as its first column and y holds the column NAME (a str)
# - GLM families: df is the X matrix with a leading column of 1s and y is the target Series
#NOTE: this rebinds the notebook-level name df (earlier bound to the frame read from the CSV)
#ytype is assigned here and again below but is not read anywhere in the code shown
ytype = type('a')
if m == 'Normal':
#there is a version of OLS that also requires endog and exog like GLM:
#http://statsmodels.org/dev/generated/statsmodels.regression.linear_model.OLS.html
#df has y followed by Xs:
if d == '':
df = pd.concat([dt.endog, dt.exog], axis=1) #a DataFrame
else:
df = dt
y = yname
else:
#df will not have y, but a column of 1s:
if d == '':
df = sm.add_constant(dt.exog) #add column of 1 to the left of dt.exog to form DataFrame
y = dt.endog #PID: 0 to 6: Party IDentification of respondent (shades of Democrat or Republican)
else:
df = sm.add_constant(dt.iloc[:, 1:])
y = dt.iloc[:, 0]
ytype = type(y)
#m3 = input('1: Reproducible output (input any integer except 2), 2: Random train-test data split [default: 1]? ').strip()
#m3 is hard-coded; unless it equals the string '2' it is coerced to an int and later
#passed as random_state to train_test_split
m3 = 1
if m3 != '2':
try:
if m3 == '':
m3 = 1
else:
m3 = int(m3)
print(m3, 'entered.')
except:
m3 = 1 #can be changed to any integer for reproducible randomization
print(m3, 'assumed.')
print('\nAssuming', m, 'error distribution.')
def delcorr(df, target=None, delta=0.005):
    """Drop, in place, one X from each near-duplicate pair of predictors.

    Predictors (every column of ``df`` except column 0, which holds y or the
    constant) are ranked by |corr(x, y)|.  Two predictors adjacent in that
    ranking form a candidate pair when their |corr(x, y)| differ by at most
    ``delta``; the pair is confirmed when |corr(x1, x2)| > 1 - ``delta``.
    From a confirmed pair the variable with the longer name is dropped
    (the first of the two when the names are equally long).  The scan is
    repeated until a pass finds nothing to drop.

    Parameters:
        df: DataFrame whose columns 1.. are the predictors.  MODIFIED IN
            PLACE: the selected columns are dropped from it.
        target: Series to correlate against, or the name of a column of
            ``df``.  When None (the original call style) the enclosing
            script's names are used: ``df[yname]`` if ``m == 'Normal'``,
            otherwise ``y``.
        delta: tolerance used both for "similar |corr(x, y)|" and for the
            1 - delta pairwise correlation threshold.

    Returns:
        List of the dropped column names, in the order they were chosen.
    """
    if target is None:
        target = df[yname] if m == 'Normal' else y
    elif isinstance(target, str):
        target = df[target]
    #corrwith() stopped discarding non-numeric columns implicitly in pandas 2.0,
    #so select the numeric (and boolean) predictors explicitly
    predictors = df.iloc[:, 1:].select_dtypes(include=['number', 'bool'])
    corr_y = predictors.corrwith(target).sort_values(key=abs) #ascending |corr(x, y)|
    deleted = []
    while True:
        #True where a predictor's |corr(x, y)| is within delta of the one ranked just below it
        close = corr_y.abs().diff() <= delta
        names = list(close.index)
        batch = []
        print('\nX pairs with correlations >', 1 - delta, ':')
        for pos in range(1, len(names)):
            if not close.iloc[pos]:
                continue
            dv0, dv1 = names[pos - 1], names[pos]
            if not abs(df[dv0].corr(df[dv1])) > 1 - delta:
                continue
            #neither should already be marked in this pass:
            if dv0 in batch or dv1 in batch:
                continue
            #The ranking is ascending and only pairs within delta reach this point, so the
            #former "drop the x with clearly lower |corr(x, y)|" test could never be true;
            #the name-length rule is what actually decides: drop the x with the longer name.
            victim = dv1 if len(dv0) < len(dv1) else dv0
            batch.append(victim) #for en masse deletion later
            corr_y = corr_y.drop(victim)
            #NOTE(review): the source's indentation was lost; the pair is reported only when
            #a deletion is recorded, which agrees with the logged output of this notebook
            print(dv0,',',dv1)
        if not batch:
            print('(no more)')
            return deleted
        df.drop(axis=1, columns=batch, inplace=True) #variables deleted en masse
        deleted = deleted + batch
        print('\n' + str(len(batch)), 'variables considered for deletion:')
        print('\n'.join([str(x) for x in batch]))
#delete collinear Xs:
#delcorr drops the chosen columns from df itself and returns their names
dl2 = delcorr(df)
#df.drop(axis=1, columns=dl2, inplace=True) #collinear Xs deleted en masse #not necessary since df operated on directly
if len(dl2) > 0:
print('\n' + str(len(dl2)) + ' variables deleted.')
#transform all Xs into either square & square-root or cube & cube-root using np.cbrt()
#every suffix is exactly 5 characters long; later code recognises transformed columns via x[-5:] in trf
trf = ['_sqar', '_sqrt', '_cube', '_cbrt']
import numpy as np
#iterate over a snapshot of the column names, since columns are added to / dropped from df inside the loop
#NOTE: this reuses the name i of the enclosing for-loop
for i in list(df)[1:]:
#excluded either y or column of 1s
failed = False
try:
#searching for -ve values:
df[i + trf[1]] = np.sqrt(df[i])
if df[i + trf[1]].isnull().any():
#bug reported by Sharifah
#the square root produced missing values: discard it and fall back to cube / cube-root below
failed = True
del df[i + trf[1]]
else:
df[i + trf[0]] = df[i] ** 2.
except:
failed = True
if failed:
try:
#searching for non-numeric
df[i + trf[2]] = df[i] ** 3.
df[i + trf[3]] = np.cbrt(df[i])
except:
#column cannot be transformed
#delete non-numeric column (with no questions asked!):
df.drop(i, axis=1, inplace=True) #drops the whole column i (axis=1)
#only numeric columns left
#delete collinear Xs:
#run delcorr on a copy so that only the TRANSFORMED columns it flags are removed from df
df0 = df.copy()
dl2 = delcorr(df0)
dl2 = [x for x in dl2 if x[-5:] in trf]
if len(dl2) > 0:
df.drop(axis=1, columns=dl2, inplace=True) #collinear transformed variables deleted en masse
print('\n' + str(len(dl2)) + ' transformed variables deleted.')
from sklearn.model_selection import train_test_split
#split into training & testing sets
#Normal path: Xs are df without its first column and the target is df[y] (y is a column name here)
#GLM paths: the whole df (including its leading constant column) is the X matrix and y is a Series
x_train, x_test, y_train, y_test = train_test_split(df.iloc[:, 1:] if m == 'Normal' else df,
df[y] if m == 'Normal' else y, test_size=.2,
random_state=(None if m3=='2' else m3)) #set to an integer (here m3) to get reproducible output
#sort columns by absolute correlation with y, so may preferably delete last column if regression fails:
#columns whose correlation is missing are placed first (na_position='first') - presumably this keeps
#the GLM constant column in position 0, which later code strips with iloc[:, 1:]; TODO confirm
x_train = x_train[x_train.corrwith(y_train).sort_values(ascending=False, na_position='first', key=abs).index]
print('\nFit using', ('' if m3 == '2' else 'reproducible ') + 'random 80% (x_train & y_train) of data rows:')
#first do OLS on untransformed Xs:
#df becomes [y_train, Xs]; on GLM paths the first column of x_train is skipped
df = pd.concat([y_train, x_train if m == 'Normal' else x_train.iloc[:, 1:]], axis=1)
xpure = [x for x in list(df) if x[-5:] not in trf] #y followed by the untransformed Xs
numx = len(xpure) - 1
print('\nOLS fit including only', numx, 'untransformed Xs:')
#initialize for adj-R2:
ddf = np.inf #deficiency in degree of freedom = rank deficiency
maxR2 = -np.inf
bmodeleq = ''
bic0 = np.inf #bic kept by lowest overall rank deficiency
bic5 = bic0 #bic kept by best model with p-values < 0.05
bicd = bic0 #bic kept by lowest rank deficiency
bbic = bic0 #best bic kept by adj-R2
p05 = False #once found all Xs' p-values < 0.05
#formula "y ~ x1 + x2 + ...": join everything with ' + ' then turn the first '+' into '~'
modeleq = ' + '.join(list(xpure)).replace('+', '~', 1)
from statsmodels.formula.api import ols
try:
    out = ols(modeleq, df).fit()
    print(out.summary2())
    if numx > 1:
        print("\nDescending order of", numx, "X's significance, assuming Normal error distribution:")
        #ascending must be passed by keyword and no positional axis given: Series.sort_values()
        #takes keyword-only arguments from pandas 2.0, so sort_values(0, ...) raises TypeError there
        #and, being swallowed below, silently prevented the baseline from being recorded
        print('\n'.join(list(abs(out.tvalues[1:]).sort_values(ascending=False).index)))
    #if the single best variable isn't high in above ranking, collinearity might be an issue
    dfm = int(out.df_model)
    ddf = numx - dfm #rank deficiency
    maxR2 = out.rsquared_adj
    bmodeleq = modeleq
    print('\n' + 'Rank deficiency =', str(ddf) + ': Df Model (' + str(dfm) + ') is',
          ('less than' if ddf > 0 else 'same as'), 'number of Xs (' + str(numx) + ').')
except Exception as err:
    #best-effort baseline: report the failure instead of hiding it, and carry on with
    #whatever initial values above were not yet overwritten
    print('\n*** OLS fit on untransformed Xs failed:', err)
#From here on the search runs over all columns, transformed Xs included.
#On GLM paths the OLS baseline above is discarded and df / y are rebound to the
#training X matrix and the training target.
print('\n'+ m, 'fit including transformed Xs:')
if m != 'Normal':
#undo above ols:
ddf = np.inf
maxR2 = -np.inf
bmodeleq = ''
df = x_train
y = y_train
#initialize for rank deficiency:
ddfd = ddf #best rank deficiency
R2df = maxR2 #R2 for best rank deficiency
modeleqdf = bmodeleq #modeleq for best rank deficiency
#initialize for overall rank deficiency
ddf0 = ddfd
R2df0 = R2df
modeleqdf0 = modeleqdf
bddf = ddfd #rank deficiency for best adj-R2 model
#initialize for z-stat p-values < 0.05:
ddf5 = np.inf #rank deficiency for best model with p-values < 0.05
R205 = -np.inf #adj-R2 for best model with p-values < 0.05
modeleq05 = '' #modeleq for best model with p-values < 0.05
df0 = df.copy() #kept for inclusion of interaction variables later
#perform feature selection using adjusted R2
#model equation actually not used by GLM:
#(on GLM paths modeleq only serves as a text record of the current column set)
modeleq = ' + '.join(list(df)).replace('+', '~', 1)
#print(modeleq)
numx = df.shape[1] - 1
x1x2 = False #interaction variables not yet included
#Backward elimination: each pass fits the current column set, scores it, records it against
#the running best candidates, then drops one X. The loop is left via the break below, once
#a single X remains after the interaction variables have been tried.
while True:
if m == 'Normal':
#https://www.statsmodels.org/stable/generated/statsmodels.formula.api.ols.html
out = ols(modeleq, df).fit()
R2 = out.rsquared_adj
else:
#GLM distribution
try:
#https://www.statsmodels.org/stable/generated/statsmodels.genmod.generalized_linear_model.GLM.html
out = sm.GLM(y, df, family=dist).fit()
R2 = 1 - (1 - y.corr(out.fittedvalues)**2) * (out.nobs - 1) / out.df_resid #pseudo adjusted r2
#R2 != R2 is true only when R2 is NaN
if R2 != R2 and out.fittedvalues.isna().sum() == 0:
R2 = -np.inf
#R2 = -out.bic
except:
#GLM failed! do ols for this round, just to delete 1 x:
#NOTE(review): on this path df holds no y column, so the formula built here uses
#df's first column as the response - TODO confirm this is intended
try:
out = ols(' + '.join(list(df)).replace('+', '~', 1), df).fit() #do OLS instead
except:
pass
R2 = -np.inf
#largest p-value among the Xs (first coefficient skipped) and the model's degrees of freedom;
#fall back to maxp = 1, dfm = 0 if they cannot be read from out
try:
maxp = max(out.pvalues[1:])
dfm = int(out.df_model)
except:
maxp = 1
dfm = 0
ddf = numx - dfm #rank deficiency
#Record this pass against four running candidates:
# bmodeleq / maxR2 / bddf      - highest adjusted R2
# modeleqdf0 / R2df0 / ddf0    - lowest overall rank deficiency
# modeleq05 / R205 / ddf5      - all X p-values < 0.05
# modeleqdf / R2df / ddfd      - lowest rank deficiency
#The bic* values are only stored while maxR2 is still -inf.
#Any exception raised in this bookkeeping is swallowed by the except below.
#see if a better model is found:
try:
if R2 >= maxR2 and out.fittedvalues.isna().sum() == 0:
maxR2 = R2
bmodeleq = modeleq
bddf = min(bddf, ddf)
ddf0 = ddf #best overall rank deficiency
if maxR2 == -np.inf:
bbic = out.bic
if maxp >= 0.05 and not p05:
#reset z-stat p-value criterion:
R205 = -np.inf
modeleq05 = ''
#reset rank deficiency criterion:
R2df = R205
modeleqdf = ''
ddfd = bddf #reset deficient df
else:
p05 = True
#if m != 'Normal':
# df1 = df.copy()
#see if a model is found with reduced overall rank deficiency:
if ddf < ddf0 or (ddf == ddf0 and R2 > R2df0):
R2df0 = R2
modeleqdf0 = modeleq
ddf0 = ddf #best overall rank deficiency
if maxR2 == -np.inf:
bic0 = out.bic
#see if a better model is found with max(z-stat p-value) < .05:
if maxp < .05 and (R2 > R205 or modeleq05 == ''):
R205 = R2
modeleq05 = modeleq
ddf5 = min(ddf5, ddf) #rank deficiency for best model with p-values < .05
if maxR2 == -np.inf:
bic5 = out.bic
#see if a model is found with reduced rank-deficiency:
if ddf < ddfd or (ddf == ddfd and R2 > R2df):
R2df = R2
modeleqdf = modeleq
ddfd = min(ddfd, ddf) #best rank deficiency
if maxR2 == -np.inf:
bicd = out.bic
except:
pass
print('\nAdjusted R² =', R2, ', max(X p-value) =', maxp, ', rank deficiency =', ddf, ', for', numx, 'Xs.')
#Only one X is left. The first time this happens (x1x2 False) the search restarts from the
#best model found so far, extended with 2-way interaction variables; the second time
#(x1x2 True) the while loop ends.
if numx == 1:
print('Variable left:', modeleq[modeleq.find('~') + 2 :])
if x1x2:
#one xvar left
#get out of 'while' loop:
break
else:
if maxR2 == -np.inf and out.fittedvalues.isna().sum() > 0:
print('\n*** Y variable', yname if m == 'Normal' else y.name, 'might not work with', m, 'distribution.')
#use all Xs before deletion:
bmodeleq = ' + '.join(list(df0)).replace('+', '~', 1)
else:
#a candidate replaces the current best when its model equation string is SHORTER
#(len() of the formula text), under the conditions spelled out in each test
#see if best model with all z-stat p-values < 0.05 is smaller than best model by adjusted R2:
if (R205 > -np.inf and len(modeleq05) < len(bmodeleq)) or (
R205 == -np.inf and (maxR2 == -np.inf or (len(modeleq05) > 0 and len(modeleq05) < len(bmodeleq)))):
bmodeleq = modeleq05
maxR2 = R205
#bddf = min(bddf, ddf5)
bddf = ddf5
if maxR2 == -np.inf:
#bmodeleq0 and bddf0 are assigned here but not read anywhere in the code shown
bmodeleq0 = bmodeleq
bddf0 = bddf
bbic = bic5 #best bic
bic5 = np.inf #re-initialize
#see if model with smallest rank-deficiency is smaller than best model so far:
if (R2df > -np.inf and len(modeleqdf) < len(bmodeleq)) or (
R2df == -np.inf and (maxR2 == -np.inf or (len(modeleqdf) > 0 and len(modeleqdf) < len(bmodeleq)))):
bmodeleq = modeleqdf
maxR2 = R2df
#bddf = min(bddf, ddfd)
bddf = ddfd
if maxR2 == -np.inf:
bmodeleq0 = bmodeleq
bddf0 = bddf
bbic = bicd #best bic
bicd = np.inf #re-initialize
if maxR2 == -np.inf:
#reset z-stat p-value criterion:
R205 = -np.inf
modeleq05 = ''
#reset rank deficiency criterion:
R2df = R205
modeleqdf = ''
ddfd = np.inf #reset deficient df
#add interaction variables for original untransformed variables in best model so far
#the number of Xs is recovered by counting '+' in the formula text
numx = bmodeleq.count('+') + 1
if numx == 1:
bmodeleq = ' + '.join(list(df0)).replace('+', '~', 1)
numx = bmodeleq.count('+') + 1
print('\nRestarting from best model (with', numx, 'Xs & Adjusted R² =', str(maxR2) + ') found so far...')
#column names of the best model, recovered by splitting its formula text
colname = bmodeleq.replace('~', '+').split(' + ')
df = df0[colname]
colname = colname[1:] #remove y or 'const'
x_test = x_test[colname]
# for i in range(numx):
# #look for 1st transformed variable:
# if colname[i][-5:] in trf:
# i = i - 1
# #colname[i] is the last untransformed x
# break
# #actually, nothing to do if i<=0
# print('\nAdding', int((i + 1) * i / 2), '2-way interactions among', i + 1,
# 'untransformed variables in best model found so far:')
# for j in range(i):
# #untransformed x in colname up to [i]
# for k in range(j + 1, i + 1):
# a = colname[j] + '_x_' + colname[k]
# print(a)
# df[a] = df[colname[j]] * df[colname[k]]
# x_test[a] = x_test[colname[j]] * x_test[colname[k]]
xpure = [x for x in colname if x[-5:] not in trf] #untransformed x names
#i is the index of the last untransformed x, so i + 1 variables give (i + 1) * i / 2 pairs
i = len(xpure) - 1
#actually, nothing to do if i<=0
print('\nAdding', int((i + 1) * i / 2), '2-way interactions among', i + 1,
'untransformed variables in best model found so far:')
for j in range(i):
#untransformed x in colname up to [i]
for k in range(j + 1, i + 1):
a = xpure[j] + '_x_' + xpure[k]
print(a)
#the same product column is added to the training frame and to x_test
df[a] = df[xpure[j]] * df[xpure[k]]
x_test[a] = x_test[xpure[j]] * x_test[xpure[k]]
df0 = df.copy()
#delete collinear Xs introduced:
#delcorr mutates df; df0 (the copy just taken) is the frame that is kept, and only the
#interaction columns flagged by delcorr are removed from it
dl2 = delcorr(df)
dl2 = [x for x in dl2 if x.find('_x_') != -1] #only interaction variables kept
if len(dl2) > 0:
df0.drop(axis=1, columns=dl2, inplace=True) #collinear interaction variables deleted en masse, for real
x_test.drop(axis=1, columns=dl2, inplace=True)
#remaining Xs may be collinear
print('\n' + str(len(dl2)) + ' interaction variables deleted.')
#potential collinearity issues handled
#sort columns by absolute correlation with y, so may delete last column if regression fails:
#df0 = df0[df0.corrwith(y_train).sort_values(ascending=False, na_position='first', key=abs).index]
#the first column (y or 'const') stays in front; only columns 1.. are re-ordered
df0 = df0[[df0.columns[0]] + list(df0.iloc[:, 1:].corrwith(y_train)
.sort_values(ascending=False, na_position='first', key=abs).index)]
modeleq = ' + '.join(list(df0)).replace('+', '~', 1)
numx = df0.shape[1] - 1
if maxR2 == -np.inf:
bddf = np.inf
ddf5 = bddf
ddfd = bddf
p05 = False
x1x2 = True #interaction variables already included
#beyond-pairwise collinearity may still be introduced with the interaction variables
df = df0.copy() #ready for continuing deletion
continue
#identify X variable to delete by finding the one with smallest abs(t-stat):
t = out.tvalues[1:]
#when several Xs tie for the smallest |t| the last of them is taken;
#if that lookup raises, the last X in t is taken instead
try:
xdrop = list(t[abs(t) == min(abs(t))].index)[-1]
except:
xdrop = list(t.index)[-1]
print('Variable to drop:', xdrop)
#a failure to drop the column is ignored; modeleq and numx are updated regardless
try:
df.drop(xdrop, axis=1, inplace=True)
except:
pass
modeleq = ' + '.join(list(df)).replace('+', '~', 1)
numx = numx - 1
#After the elimination loop: arbitrate between the recorded candidates. As in the restart
#step, "smaller" means a shorter model-equation string.
#see if best model with all z-stat p-values < 0.05 is smaller than best model by adjusted R2:
if (R205 > -np.inf and len(modeleq05) < len(bmodeleq)) or (
R205 == -np.inf and (maxR2 == -np.inf or (len(modeleq05) > 0 and len(modeleq05) < len(bmodeleq)))):
bmodeleq = modeleq05
maxR2 = R205
bddf = ddf5
#see if model with smallest rank-deficiency is smaller than best model so far:
if (R2df > -np.inf and len(modeleqdf) < len(bmodeleq)) or (
R2df == -np.inf and (maxR2 == -np.inf or (len(modeleqdf) > 0 and len(modeleqdf) < len(bmodeleq)))):
bmodeleq = modeleqdf
maxR2 = R2df
bddf = ddfd
if maxR2 == -np.inf and out.fittedvalues.isna().any():
#some nan in y fit
print('\n*** Y variable', yname if m == 'Normal' else y.name, 'might not work with', m, 'distribution.')
else:
#If the lowest-overall-rank-deficiency model is at least as good (by adjusted R2, or by BIC
#while no adjusted R2 is available), refit it and adopt it only when all X p-values are < 0.05.
#NOTE(review): bmodeleq_0 / maxR2_0 / bddf_0 are assigned only inside this try; if an exception
#is raised before those assignments the except handler reads names that may be unbound or
#left over from an earlier pass of the outer loop - TODO confirm
try:
if bddf >= ddf0 and ((maxR2 > -np.inf and R2df0 >= maxR2) or (maxR2 == -np.inf and bic0 <= bbic)):
bmodeleq_0 = bmodeleq
maxR2_0 = maxR2
bddf_0 = bddf
#prefer smaller rank deficiency
if m == 'Normal':
out = ols(modeleqdf0, df0).fit()
else:
out = sm.GLM(y, df0[modeleqdf0.replace('~', '+').split(' + ')], family=dist).fit()
if max(out.pvalues[1:]) < 0.05:
#Xs' p-values < 0.05
bmodeleq = modeleqdf0
maxR2 = R2df0
bddf = ddf0
except:
bmodeleq = bmodeleq_0
maxR2 = maxR2_0
bddf = bddf_0
#Refit the chosen model on the training data and align x_test's columns with it.
#The except that closes this section (after the plots) appears to pair with this try.
try:
if m == 'Normal':
out = ols(bmodeleq, df0).fit()
#collinearity is still entirely possible at this stage
x_test = x_test[df0.columns[1:]]
else:
#out = sm.GLM(y, df1, family=dist).fit()
#x_test = sm.add_constant(x_test)[df1.columns]
#keep only the columns named in the best model equation, then give x_test the same columns
#(add_constant supplies the 'const' column that x_test lost earlier)
df0 = df0[bmodeleq.replace('~', '+').split(' + ')]
out = sm.GLM(y, df0, family=dist).fit()
x_test = sm.add_constant(x_test)[df0.columns]
numx = bmodeleq.count('+') + 1
print('\nBest model has', numx, 'Xs (Adjusted R² =', str(maxR2), ', rank deficiency =', str(bddf) + '):\n')
print(out.summary2())
if m == 'Normal':
print()
if numx > 1:
    print("Descending order of", numx, "X's significance, assuming", m, 'error distribution:')
    #print('\n'.join(list(abs(out.tvalues[1:]).sort_values(0, ascending=False).index)))
    #Table of coefficient and test statistic per term: the first row (intercept / constant) stays
    #on top, the remaining rows are ordered by descending |statistic|.
    #axis must be passed by keyword: pd.concat() accepts only objs positionally from pandas 2.0,
    #so concat([...], 1) raises TypeError there and the enclosing handler would then
    #report it as a problem with the error distribution.
    first_row = pd.concat([out.params[:1], out.tvalues[:1]], axis=1)
    ranked_rows = pd.concat([out.params[1:], out.tvalues[1:]], axis=1).sort_values(1, key=abs, ascending=False)
    print(pd.concat([first_row, ranked_rows]).rename(columns={0:"Coefficient", 1:"z-stat"}))
#if the single best variable isn't high in above ranking, collinearity might be an issue
dfm = int(out.df_model)
print('\n' + 'Rank deficiency =', str(bddf) + ': Df Model (' + str(dfm) + ') is',
('less than' if bddf > 0 else 'same as'), 'number of Xs (' + str(numx) + ').')
import matplotlib.pyplot as pl
%matplotlib inline
pl.rcParams['lines.markersize'] = 2.5
pl.rcParams['lines.linewidth'] = 1
# if m == 'Normal':
#partial leverage plots, partial regression plots, added-variable plots
#https://r-bloggers.com/2021/03/partial-regression-plots-in-julia-python-and-r
#https://www.statsmodels.org/stable/generated/statsmodels.graphics.regressionplots.plot_partregress.html
#https://www.statsmodels.org/stable/generated/statsmodels.graphics.regressionplots.plot_partregress_grid.html
from statsmodels.graphics.regressionplots import plot_partregress_grid
import math
#includes intercept; 4 plots to a row:
nr = math.ceil((numx + 1) / 4) #number of rows of plots
pl.rcParams["figure.figsize"] = (20.2, 14 / 3 * nr) #plot height depends on number of rows of plots
print('\nPartial Leverage (or Partial Regression, or Added-Variable) diagnostic plots for fit:')
#grid(rows, columns) for plots; fixed at 4 columns of plots per row:
plot_partregress_grid(out, grid=(nr, 4))
pl.show()
#In-sample fit quality on the training rows.
#y_train is rebound here: first column of df on the Normal path, the Series y otherwise.
y_fit = out.fittedvalues
y_train = df.iloc[:, 0] if m == 'Normal' else y
print('\n'+ m, 'fit using', ('' if m3 == '2' else 'reproducible ') + 'random 80% (x_train & y_train) of data rows:\n')
print(' Mean Absolute Residual =', abs(y_train - y_fit).mean())
print('Root Mean Squared Residual =', np.sqrt(((y_train - y_fit) ** 2.).mean()))
#R2 is reported as the squared correlation between observed and fitted values
r2_train = y_train.corr(y_fit) ** 2.
print(' R² =', r2_train)
#Out-of-sample quality on the held-out rows, using the same three measures.
y_predict = out.predict(x_test) #forecast
print('\n' + m, 'prediction using remaining 20% (x_test & y_test) of data rows:\n')
print(' Mean Absolute Error =', abs(y_test - y_predict).mean())
print('Root Mean Squared Error =', np.sqrt(((y_test - y_predict) ** 2.).mean()))
r2_test = y_test.corr(y_predict) ** 2.
print(' R² =', r2_test)
print('\nPlots of train-set fit & test-set predict:')
#plot y_train vs y_fit
#https://stackoverflow.com/questions/42818361/how-to-make-two-plots-side-by-side-using-python
# pl.rcParams["figure.figsize"] = (4.04, 4.04)
pl.rcParams["figure.figsize"] = (20.2, 4.5)
# pl.rcParams['lines.markersize'] = 3
pl.subplot(1, 4, 1) #1 row, 4 columns, plot 1
pl.title(' y_train vs y_fit, R² = ' + str(round(r2_train, 3)))
pl.scatter(y_fit, y_train, s=3)
# pl.show()
#plot y_test vs y_predict
pl.subplot(1, 4, 2) #1 row, 4 columns, plot 2
pl.title(' y_test vs y_predict, R² = ' + str(round(r2_test, 3)))
# pl.scatter(y_predict, y_test, s=3);
pl.scatter(y_predict, y_test, s=3)
pl.show()
#NOTE(review): this handler appears to close the try opened before the final fit, so ANY failure
#in between (fit, printing, plotting, prediction) is reported with the message below - TODO confirm
except:
print('\n*** Y variable', yname if m == 'Normal' else y.name, 'might not work with', m, 'distribution.')
========================================================== Next GLM Model ==============================================================
counter = 1
GLM Model Fitted = Normal
1 entered.
Assuming Normal error distribution.
X pairs with correlations > 0.995 :
SPY_Close_Price , SPY_Open_Price
1 variables considered for deletion:
SPY_Close_Price
X pairs with correlations > 0.995 :
(no more)
1 variables deleted.
X pairs with correlations > 0.995 :
FTSE_Open_Price_sqar , FTSE_Open_Price_sqrt
FTSE_Open_Price_sqrt , FTSE_Open_Price
FTSE_High_Price_sqar , FTSE_High_Price
FTSE_High_Price , FTSE_High_Price_sqrt
FTSE_Low_Price_sqrt , FTSE_Low_Price_sqar
FTSE_Low_Price_sqar , FTSE_Low_Price
6 variables considered for deletion:
FTSE_Open_Price_sqar
FTSE_Open_Price_sqrt
FTSE_High_Price_sqar
FTSE_High_Price_sqrt
FTSE_Low_Price_sqrt
FTSE_Low_Price_sqar
X pairs with correlations > 0.995 :
(no more)
6 transformed variables deleted.
Fit using reproducible random 80% (x_train & y_train) of data rows:
OLS fit including only 17 untransformed Xs:
Results: Ordinary least squares
============================================================================================
Model: OLS Adj. R-squared: 0.993
Dependent Variable: FTSE_Close_Price AIC: 3643.1592
Date: 2021-04-22 13:46 BIC: 3712.6534
No. Observations: 351 Log-Likelihood: -1803.6
Df Model: 17 F-statistic: 2885.
Df Residuals: 333 Prob (F-statistic): 0.00
R-squared: 0.993 Scale: 1792.9
--------------------------------------------------------------------------------------------
Coef. Std.Err. t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept 231.5217 111.9089 2.0688 0.0393 11.3842 451.6592
FTSE_Low_Price 0.7842 0.0457 17.1672 0.0000 0.6943 0.8740
FTSE_High_Price 0.6129 0.0549 11.1584 0.0000 0.5048 0.7209
FTSE_Open_Price -0.3757 0.0363 -10.3364 0.0000 -0.4472 -0.3042
SPY_Open_Price -0.6321 0.3893 -1.6237 0.1054 -1.3978 0.1337
cumCasesByPublishDate -0.0001 0.0000 -1.2514 0.2117 -0.0002 0.0000
FTSE_Volume -0.0000 0.0000 -2.5064 0.0127 -0.0000 -0.0000
US_Covid_Cases -0.0000 0.0000 -1.1749 0.2409 -0.0000 0.0000
cumVirusTests 0.0000 0.0000 2.0727 0.0390 0.0000 0.0000
US_Covid_Deaths -0.0003 0.0005 -0.5383 0.5907 -0.0013 0.0008
cumAdmissions 0.0018 0.0012 1.4977 0.1351 -0.0006 0.0042
cumPeopleVaccinatedCompleteByPublishDate -0.0000 0.0000 -2.0560 0.0406 -0.0000 -0.0000
cumDailyNsoDeathsByDeathDate -0.0026 0.0027 -0.9650 0.3352 -0.0078 0.0027
Unnamed__0 0.1010 0.2194 0.4602 0.6457 -0.3307 0.5326
Sun_Hours 0.4980 0.3462 1.4383 0.1513 -0.1831 1.1791
Max_Temperature_DegC -27.0565 12.0258 -2.2499 0.0251 -50.7126 -3.4004
Rainfall_mm -0.5798 0.5001 -1.1594 0.2471 -1.5635 0.4039
Min_Temperature_DegC 30.7883 12.9053 2.3857 0.0176 5.4022 56.1745
--------------------------------------------------------------------------------------------
Omnibus: 41.833 Durbin-Watson: 2.000
Prob(Omnibus): 0.000 Jarque-Bera (JB): 248.624
Skew: 0.195 Prob(JB): 0.000
Kurtosis: 7.105 Condition No.: 49810797143
============================================================================================
* The condition number is large (5e+10). This might indicate strong
multicollinearity or other numerical problems.
Descending order of 17 X's significance, assuming Normal error distribution:
FTSE_Low_Price
FTSE_High_Price
FTSE_Open_Price
FTSE_Volume
Min_Temperature_DegC
Max_Temperature_DegC
cumVirusTests
cumPeopleVaccinatedCompleteByPublishDate
SPY_Open_Price
cumAdmissions
Sun_Hours
cumCasesByPublishDate
US_Covid_Cases
Rainfall_mm
cumDailyNsoDeathsByDeathDate
US_Covid_Deaths
Unnamed__0
Rank deficiency = 0: Df Model (17) is same as number of Xs (17).
Normal fit including transformed Xs:
Adjusted R² = 0.826607683992185 , max(X p-value) = 0.25796878843555066 , rank deficiency = 32 , for 45 Xs.
Variable to drop: cumVirusTests_sqrt
Adjusted R² = 0.8275973276398689 , max(X p-value) = 0.21735522380657735 , rank deficiency = 31 , for 44 Xs.
Variable to drop: Sun_Hours_sqar
Adjusted R² = 0.8250303533702403 , max(X p-value) = 0.10017061843900948 , rank deficiency = 30 , for 43 Xs.
Variable to drop: FTSE_Volume
Adjusted R² = 0.8481713995272524 , max(X p-value) = 0.292816358485037 , rank deficiency = 30 , for 42 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate
Adjusted R² = 0.8475613118694245 , max(X p-value) = 0.030740949362100967 , rank deficiency = 29 , for 41 Xs.
Variable to drop: cumCasesByPublishDate_sqar
Adjusted R² = 0.849944337764815 , max(X p-value) = 0.19967631849630305 , rank deficiency = 29 , for 40 Xs.
Variable to drop: cumVirusTests
Adjusted R² = 0.8510476355192589 , max(X p-value) = 0.24083263348139305 , rank deficiency = 29 , for 39 Xs.
Variable to drop: US_Covid_Deaths_sqar
Adjusted R² = 0.853192769760013 , max(X p-value) = 0.28671412423223436 , rank deficiency = 29 , for 38 Xs.
Variable to drop: US_Covid_Deaths_sqrt
Adjusted R² = 0.8532394736969164 , max(X p-value) = 0.16777531393277273 , rank deficiency = 28 , for 37 Xs.
Variable to drop: cumAdmissions
Adjusted R² = 0.8512359747705537 , max(X p-value) = 5.706183218060886e-08 , rank deficiency = 27 , for 36 Xs.
Variable to drop: US_Covid_Deaths
Adjusted R² = 0.8418231696904797 , max(X p-value) = 0.925646924043702 , rank deficiency = 26 , for 35 Xs.
Variable to drop: Unnamed__0_sqar
Adjusted R² = 0.8417830683319295 , max(X p-value) = 0.012284386194111353 , rank deficiency = 25 , for 34 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqar
Adjusted R² = 0.8392850213406523 , max(X p-value) = 1.320388438357376e-08 , rank deficiency = 25 , for 33 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqar
Adjusted R² = 0.8225696834188813 , max(X p-value) = 0.07582257411566104 , rank deficiency = 25 , for 32 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate
Adjusted R² = 0.822450470676079 , max(X p-value) = 0.0008237513108627133 , rank deficiency = 25 , for 31 Xs.
Variable to drop: cumVirusTests_sqar
Adjusted R² = 0.8171605257593518 , max(X p-value) = 1.9148664411599767e-14 , rank deficiency = 25 , for 30 Xs.
Variable to drop: cumCasesByPublishDate
Adjusted R² = 0.7845232112604724 , max(X p-value) = 4.8514373487687905e-28 , rank deficiency = 25 , for 29 Xs.
Variable to drop: cumAdmissions_sqar
Adjusted R² = 0.6942929828445694 , max(X p-value) = 1.5612756339242417e-21 , rank deficiency = 25 , for 28 Xs.
Variable to drop: FTSE_Volume_sqar
Adjusted R² = 0.9927161211382474 , max(X p-value) = 0.9476149041648358 , rank deficiency = 10 , for 27 Xs.
Variable to drop: cumAdmissions_sqrt
Adjusted R² = 0.9927379957475767 , max(X p-value) = 0.9123044436910306 , rank deficiency = 10 , for 26 Xs.
Variable to drop: SPY_Open_Price_sqar
Adjusted R² = 0.9927410838674037 , max(X p-value) = 0.9879811551418602 , rank deficiency = 9 , for 25 Xs.
Variable to drop: Sun_Hours_sqrt
Adjusted R² = 0.9927412135415726 , max(X p-value) = 0.9841442898537537 , rank deficiency = 8 , for 24 Xs.
Variable to drop: Unnamed__0
Adjusted R² = 0.9927628460234671 , max(X p-value) = 0.9908434757602849 , rank deficiency = 8 , for 23 Xs.
Variable to drop: Max_Temperature_DegC
Adjusted R² = 0.9927628965373916 , max(X p-value) = 0.8595330448970508 , rank deficiency = 7 , for 22 Xs.
Variable to drop: Max_Temperature_DegC_sqrt
Adjusted R² = 0.9927629131652195 , max(X p-value) = 0.7794559846361968 , rank deficiency = 6 , for 21 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqrt
Adjusted R² = 0.9927827033435752 , max(X p-value) = 0.8029367283925661 , rank deficiency = 6 , for 20 Xs.
Variable to drop: Min_Temperature_DegC_sqrt
Adjusted R² = 0.9927826907031152 , max(X p-value) = 0.6837597313876718 , rank deficiency = 5 , for 19 Xs.
Variable to drop: SPY_Open_Price_sqrt
Adjusted R² = 0.9927827986446083 , max(X p-value) = 0.5377814072605769 , rank deficiency = 4 , for 18 Xs.
Variable to drop: Min_Temperature_DegC_sqar
Adjusted R² = 0.9927971661141927 , max(X p-value) = 0.7002303075176355 , rank deficiency = 4 , for 17 Xs.
Variable to drop: Max_Temperature_DegC_sqar
Adjusted R² = 0.9928166356235602 , max(X p-value) = 0.7651969475410724 , rank deficiency = 4 , for 16 Xs.
Variable to drop: Min_Temperature_DegC
Adjusted R² = 0.9928147317440645 , max(X p-value) = 0.4528238909384099 , rank deficiency = 3 , for 15 Xs.
Variable to drop: Sun_Hours
Adjusted R² = 0.9928239685351828 , max(X p-value) = 0.504383150426027 , rank deficiency = 3 , for 14 Xs.
Variable to drop: Unnamed__0_sqrt
Adjusted R² = 0.9928145136421224 , max(X p-value) = 0.18910416899771135 , rank deficiency = 2 , for 13 Xs.
Variable to drop: FTSE_Volume_sqrt
Adjusted R² = 0.9927990640540123 , max(X p-value) = 0.07473910069592911 , rank deficiency = 2 , for 12 Xs.
Variable to drop: SPY_Open_Price
Adjusted R² = 0.9927527170941544 , max(X p-value) = 0.1612682310287071 , rank deficiency = 2 , for 11 Xs.
Variable to drop: US_Covid_Cases_sqrt
Adjusted R² = 0.9927321623793126 , max(X p-value) = 0.8190658858234556 , rank deficiency = 2 , for 10 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqrt
Adjusted R² = 0.9927522343700771 , max(X p-value) = 0.039702510766006394 , rank deficiency = 2 , for 9 Xs.
Variable to drop: US_Covid_Cases_sqar
Adjusted R² = 0.9926792969907643 , max(X p-value) = 0.2063311762565667 , rank deficiency = 0 , for 8 Xs.
Variable to drop: Rainfall_mm_sqrt
Adjusted R² = 0.992666425686054 , max(X p-value) = 0.17153404683707957 , rank deficiency = 0 , for 7 Xs.
Variable to drop: US_Covid_Cases
Adjusted R² = 0.9926477222985566 , max(X p-value) = 0.6512708938482358 , rank deficiency = 0 , for 6 Xs.
Variable to drop: cumCasesByPublishDate_sqrt
Adjusted R² = 0.9926646717577665 , max(X p-value) = 0.0782270057607111 , rank deficiency = 0 , for 5 Xs.
Variable to drop: Rainfall_mm
Adjusted R² = 0.9926197296220962 , max(X p-value) = 0.18794384119627083 , rank deficiency = 0 , for 4 Xs.
Variable to drop: Rainfall_mm_sqar
Adjusted R² = 0.9926039794080762 , max(X p-value) = 1.0538500089819825e-21 , rank deficiency = 0 , for 3 Xs.
Variable to drop: FTSE_Open_Price
Adjusted R² = 0.9903933859344672 , max(X p-value) = 1.7835644588729087e-13 , rank deficiency = 0 , for 2 Xs.
Variable to drop: FTSE_High_Price
Adjusted R² = 0.9888031058125419 , max(X p-value) = 0.0 , rank deficiency = 0 , for 1 Xs.
Variable left: FTSE_Low_Price
Restarting from best model (with 9 Xs & Adjusted R² = 0.9927522343700771) found so far...
Adding 10 2-way interactions among 5 untransformed variables in best model found so far:
FTSE_Low_Price_x_FTSE_High_Price
FTSE_Low_Price_x_FTSE_Open_Price
FTSE_Low_Price_x_US_Covid_Cases
FTSE_Low_Price_x_Rainfall_mm
FTSE_High_Price_x_FTSE_Open_Price
FTSE_High_Price_x_US_Covid_Cases
FTSE_High_Price_x_Rainfall_mm
FTSE_Open_Price_x_US_Covid_Cases
FTSE_Open_Price_x_Rainfall_mm
US_Covid_Cases_x_Rainfall_mm
X pairs with correlations > 0.995 :
US_Covid_Cases_x_Rainfall_mm , US_Covid_Cases
FTSE_Open_Price_x_US_Covid_Cases , FTSE_High_Price_x_US_Covid_Cases
FTSE_High_Price_x_US_Covid_Cases , FTSE_Low_Price_x_US_Covid_Cases
FTSE_Open_Price , FTSE_High_Price_x_FTSE_Open_Price
FTSE_Low_Price_x_FTSE_Open_Price , FTSE_Low_Price_x_FTSE_High_Price
5 variables considered for deletion:
US_Covid_Cases_x_Rainfall_mm
FTSE_Open_Price_x_US_Covid_Cases
FTSE_High_Price_x_US_Covid_Cases
FTSE_High_Price_x_FTSE_Open_Price
FTSE_Low_Price_x_FTSE_Open_Price
X pairs with correlations > 0.995 :
FTSE_High_Price , FTSE_Low_Price_x_FTSE_High_Price
1 variables considered for deletion:
FTSE_Low_Price_x_FTSE_High_Price
X pairs with correlations > 0.995 :
(no more)
6 interaction variables deleted.
Adjusted R² = 0.9929062688131765 , max(X p-value) = 0.91187728490757 , rank deficiency = 2 , for 13 Xs.
Variable to drop: FTSE_High_Price
Adjusted R² = 0.9929269367246852 , max(X p-value) = 0.6645858522393218 , rank deficiency = 2 , for 12 Xs.
Variable to drop: US_Covid_Cases
Adjusted R² = 0.992943753375399 , max(X p-value) = 0.251083358248719 , rank deficiency = 2 , for 11 Xs.
Variable to drop: FTSE_Open_Price_x_Rainfall_mm
Adjusted R² = 0.9929371513431888 , max(X p-value) = 0.08362210896937489 , rank deficiency = 2 , for 10 Xs.
Variable to drop: Rainfall_mm
Adjusted R² = 0.9929453349240146 , max(X p-value) = 0.12126963637580654 , rank deficiency = 2 , for 9 Xs.
Variable to drop: Rainfall_mm_sqrt
Adjusted R² = 0.9928955763004006 , max(X p-value) = 0.1773418689354735 , rank deficiency = 1 , for 8 Xs.
Variable to drop: Rainfall_mm_sqar
Adjusted R² = 0.9928785901976791 , max(X p-value) = 0.01654210473946162 , rank deficiency = 1 , for 7 Xs.
Variable to drop: US_Covid_Cases_sqar
Adjusted R² = 0.9927663158595111 , max(X p-value) = 0.18090162859845127 , rank deficiency = 0 , for 6 Xs.
Variable to drop: FTSE_Low_Price_x_US_Covid_Cases
Adjusted R² = 0.9927495948884503 , max(X p-value) = 0.5525036087497635 , rank deficiency = 0 , for 5 Xs.
Variable to drop: cumCasesByPublishDate_sqrt
Adjusted R² = 0.9927631413308635 , max(X p-value) = 7.166406152903059e-23 , rank deficiency = 0 , for 4 Xs.
Variable to drop: FTSE_Open_Price
Adjusted R² = 0.9904470118677471 , max(X p-value) = 6.138440916282818e-14 , rank deficiency = 0 , for 3 Xs.
Variable to drop: FTSE_Low_Price_x_Rainfall_mm
Adjusted R² = 0.988793082498854 , max(X p-value) = 0.4074620353997429 , rank deficiency = 0 , for 2 Xs.
Variable to drop: FTSE_High_Price_x_Rainfall_mm
Adjusted R² = 0.9888031058125419 , max(X p-value) = 0.0 , rank deficiency = 0 , for 1 Xs.
Variable left: FTSE_Low_Price
Best model has 6 Xs (Adjusted R² = 0.9927663158595111 , rank deficiency = 0):
Results: Ordinary least squares
==================================================================================
Model: OLS Adj. R-squared: 0.993
Dependent Variable: FTSE_Close_Price AIC: 3639.6641
Date: 2021-04-22 13:46 BIC: 3666.6896
No. Observations: 351 Log-Likelihood: -1812.8
Df Model: 6 F-statistic: 8007.
Df Residuals: 344 Prob (F-statistic): 0.00
R-squared: 0.993 Scale: 1829.5
----------------------------------------------------------------------------------
Coef. Std.Err. t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------
Intercept 27.4387 44.8441 0.6119 0.5410 -60.7644 115.6419
FTSE_Low_Price 1.3940 0.0385 36.1919 0.0000 1.3182 1.4698
FTSE_Open_Price -0.3814 0.0359 -10.6289 0.0000 -0.4520 -0.3109
FTSE_Low_Price_x_Rainfall_mm -0.0075 0.0006 -12.8477 0.0000 -0.0087 -0.0064
FTSE_High_Price_x_Rainfall_mm 0.0073 0.0006 12.8643 0.0000 0.0062 0.0084
FTSE_Low_Price_x_US_Covid_Cases -0.0000 0.0000 -1.3407 0.1809 -0.0000 0.0000
cumCasesByPublishDate_sqrt 0.0305 0.0215 1.4225 0.1558 -0.0117 0.0728
----------------------------------------------------------------------------------
Omnibus: 40.711 Durbin-Watson: 1.962
Prob(Omnibus): 0.000 Jarque-Bera (JB): 263.904
Skew: -0.010 Prob(JB): 0.000
Kurtosis: 7.248 Condition No.: 1810167996023
==================================================================================
* The condition number is large (2e+12). This might indicate strong
multicollinearity or other numerical problems.
Descending order of 6 X's significance, assuming Normal error distribution:
Coefficient z-stat
Intercept 2.743875e+01 0.611870
FTSE_Low_Price 1.393996e+00 36.191877
FTSE_High_Price_x_Rainfall_mm 7.321943e-03 12.864254
FTSE_Low_Price_x_Rainfall_mm -7.535470e-03 -12.847734
FTSE_Open_Price -3.814345e-01 -10.628935
cumCasesByPublishDate_sqrt 3.054065e-02 1.422510
FTSE_Low_Price_x_US_Covid_Cases -2.897631e-10 -1.340703
Rank deficiency = 0: Df Model (6) is same as number of Xs (6).
Partial Leverage (or Partial Regression, or Added-Variable) diagnostic plots for fit:
Normal fit using reproducible random 80% (x_train & y_train) of data rows:
Mean Absolute Residual = 29.066173911422652
Root Mean Squared Residual = 42.34393381042232
R² = 0.9928903218733488
Normal prediction using remaining 20% (x_test & y_test) of data rows:
Mean Absolute Error = 26.7707801761965
Root Mean Squared Error = 38.27870342033717
R² = 0.9944462291696584
Plots of train-set fit & test-set predict:
========================================================== Next GLM Model ==============================================================
counter = 2
GLM Model Fitted = Poisson
1 entered.
Assuming Poisson error distribution.
X pairs with correlations > 0.995 :
SPY_Close_Price , SPY_Open_Price
1 variables considered for deletion:
SPY_Close_Price
X pairs with correlations > 0.995 :
(no more)
1 variables deleted.
X pairs with correlations > 0.995 :
FTSE_Open_Price_sqar , FTSE_Open_Price_sqrt
FTSE_Open_Price_sqrt , FTSE_Open_Price
FTSE_High_Price_sqar , FTSE_High_Price
FTSE_High_Price , FTSE_High_Price_sqrt
FTSE_Low_Price_sqrt , FTSE_Low_Price_sqar
FTSE_Low_Price_sqar , FTSE_Low_Price
6 variables considered for deletion:
FTSE_Open_Price_sqar
FTSE_Open_Price_sqrt
FTSE_High_Price_sqar
FTSE_High_Price_sqrt
FTSE_Low_Price_sqrt
FTSE_Low_Price_sqar
X pairs with correlations > 0.995 :
(no more)
6 transformed variables deleted.
Fit using reproducible random 80% (x_train & y_train) of data rows:
OLS fit including only 17 untransformed Xs:
Results: Ordinary least squares
============================================================================================
Model: OLS Adj. R-squared: 0.993
Dependent Variable: FTSE_Close_Price AIC: 3643.1592
Date: 2021-04-22 13:46 BIC: 3712.6534
No. Observations: 351 Log-Likelihood: -1803.6
Df Model: 17 F-statistic: 2885.
Df Residuals: 333 Prob (F-statistic): 0.00
R-squared: 0.993 Scale: 1792.9
--------------------------------------------------------------------------------------------
Coef. Std.Err. t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept 231.5217 111.9089 2.0688 0.0393 11.3842 451.6592
FTSE_Low_Price 0.7842 0.0457 17.1672 0.0000 0.6943 0.8740
FTSE_High_Price 0.6129 0.0549 11.1584 0.0000 0.5048 0.7209
FTSE_Open_Price -0.3757 0.0363 -10.3364 0.0000 -0.4472 -0.3042
SPY_Open_Price -0.6321 0.3893 -1.6237 0.1054 -1.3978 0.1337
cumCasesByPublishDate -0.0001 0.0000 -1.2514 0.2117 -0.0002 0.0000
FTSE_Volume -0.0000 0.0000 -2.5064 0.0127 -0.0000 -0.0000
US_Covid_Cases -0.0000 0.0000 -1.1749 0.2409 -0.0000 0.0000
cumVirusTests 0.0000 0.0000 2.0727 0.0390 0.0000 0.0000
US_Covid_Deaths -0.0003 0.0005 -0.5383 0.5907 -0.0013 0.0008
cumAdmissions 0.0018 0.0012 1.4977 0.1351 -0.0006 0.0042
cumPeopleVaccinatedCompleteByPublishDate -0.0000 0.0000 -2.0560 0.0406 -0.0000 -0.0000
cumDailyNsoDeathsByDeathDate -0.0026 0.0027 -0.9650 0.3352 -0.0078 0.0027
Unnamed__0 0.1010 0.2194 0.4602 0.6457 -0.3307 0.5326
Sun_Hours 0.4980 0.3462 1.4383 0.1513 -0.1831 1.1791
Max_Temperature_DegC -27.0565 12.0258 -2.2499 0.0251 -50.7126 -3.4004
Rainfall_mm -0.5798 0.5001 -1.1594 0.2471 -1.5635 0.4039
Min_Temperature_DegC 30.7883 12.9053 2.3857 0.0176 5.4022 56.1745
--------------------------------------------------------------------------------------------
Omnibus: 41.833 Durbin-Watson: 2.000
Prob(Omnibus): 0.000 Jarque-Bera (JB): 248.624
Skew: 0.195 Prob(JB): 0.000
Kurtosis: 7.105 Condition No.: 49810797143
============================================================================================
* The condition number is large (5e+10). This might indicate strong
multicollinearity or other numerical problems.
Descending order of 17 X's significance, assuming Normal error distribution:
FTSE_Low_Price
FTSE_High_Price
FTSE_Open_Price
FTSE_Volume
Min_Temperature_DegC
Max_Temperature_DegC
cumVirusTests
cumPeopleVaccinatedCompleteByPublishDate
SPY_Open_Price
cumAdmissions
Sun_Hours
cumCasesByPublishDate
US_Covid_Cases
Rainfall_mm
cumDailyNsoDeathsByDeathDate
US_Covid_Deaths
Unnamed__0
Rank deficiency = 0: Df Model (17) is same as number of Xs (17).
Poisson fit including transformed Xs:
Adjusted R² = 0.10761270518087696 , max(X p-value) = 2.6410020717100325e-184 , rank deficiency = 35 , for 45 Xs.
Variable to drop: FTSE_Volume_sqar
Adjusted R² = -0.05502827805304489 , max(X p-value) = 0.0 , rank deficiency = 25 , for 44 Xs.
Variable to drop: US_Covid_Cases_sqar
Adjusted R² = -inf , max(X p-value) = 0.7756314672635296 , rank deficiency = 22 , for 43 Xs.
Variable to drop: cumCasesByPublishDate_sqar
Adjusted R² = -inf , max(X p-value) = 0.7021852164619822 , rank deficiency = 22 , for 42 Xs.
Variable to drop: US_Covid_Cases_sqrt
Adjusted R² = 0.38164885270568827 , max(X p-value) = 0.496049488022139 , rank deficiency = 24 , for 41 Xs.
Variable to drop: Min_Temperature_DegC
Adjusted R² = 0.3798429854997377 , max(X p-value) = 0.056314925663405485 , rank deficiency = 23 , for 40 Xs.
Variable to drop: cumVirusTests_sqrt
Adjusted R² = 0.37694924465064117 , max(X p-value) = 2.7471985326561146e-09 , rank deficiency = 22 , for 39 Xs.
Variable to drop: Unnamed__0_sqar
Adjusted R² = -inf , max(X p-value) = 0.6112581905609482 , rank deficiency = 19 , for 38 Xs.
Variable to drop: Max_Temperature_DegC_sqar
Adjusted R² = -inf , max(X p-value) = 0.7126190445201326 , rank deficiency = 18 , for 37 Xs.
Variable to drop: Min_Temperature_DegC_sqar
Adjusted R² = -inf , max(X p-value) = 0.950671646499198 , rank deficiency = 17 , for 36 Xs.
Variable to drop: Min_Temperature_DegC_sqrt
Adjusted R² = -0.04729486688574869 , max(X p-value) = 0.5329969208847569 , rank deficiency = 19 , for 35 Xs.
Variable to drop: SPY_Open_Price_sqar
Adjusted R² = -inf , max(X p-value) = 0.3939202879622 , rank deficiency = 16 , for 34 Xs.
Variable to drop: SPY_Open_Price_sqrt
Adjusted R² = -inf , max(X p-value) = 0.3127256475567745 , rank deficiency = 15 , for 33 Xs.
Variable to drop: FTSE_Open_Price
Adjusted R² = -inf , max(X p-value) = 0.22774189272464768 , rank deficiency = 15 , for 32 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqar
Adjusted R² = -inf , max(X p-value) = 0.07790463639652317 , rank deficiency = 15 , for 31 Xs.
Variable to drop: SPY_Open_Price
Adjusted R² = -inf , max(X p-value) = 0.035877449012759285 , rank deficiency = 14 , for 30 Xs.
Variable to drop: Sun_Hours_sqar
Adjusted R² = 0.12507730252998628 , max(X p-value) = 9.177327881775789e-40 , rank deficiency = 15 , for 29 Xs.
Variable to drop: Unnamed__0
Adjusted R² = -0.039442184932307445 , max(X p-value) = 0.0 , rank deficiency = 14 , for 28 Xs.
Variable to drop: FTSE_Low_Price
Adjusted R² = 0.40061603377662003 , max(X p-value) = 2.3385946926103192e-132 , rank deficiency = 13 , for 27 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqar
Adjusted R² = 0.4013764449578733 , max(X p-value) = 1.6026996256721276e-192 , rank deficiency = 13 , for 26 Xs.
Variable to drop: US_Covid_Deaths_sqrt
Adjusted R² = 0.39899382562674846 , max(X p-value) = 2.2133788223261802e-208 , rank deficiency = 12 , for 25 Xs.
Variable to drop: cumAdmissions_sqrt
Adjusted R² = 0.396809074028029 , max(X p-value) = 0.0 , rank deficiency = 11 , for 24 Xs.
Variable to drop: Rainfall_mm_sqar
Adjusted R² = 0.4142034404869923 , max(X p-value) = 0.0 , rank deficiency = 11 , for 23 Xs.
Variable to drop: cumVirusTests_sqar
Adjusted R² = 0.9875981315870116 , max(X p-value) = 0.7970900394587686 , rank deficiency = 1 , for 22 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate
Adjusted R² = 0.9876292489881712 , max(X p-value) = 0.496427103383434 , rank deficiency = 1 , for 21 Xs.
Variable to drop: US_Covid_Deaths
Adjusted R² = 0.9876275412360199 , max(X p-value) = 0.6288080979045965 , rank deficiency = 1 , for 20 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqrt
Adjusted R² = 0.9876457585531827 , max(X p-value) = 0.5366351046099397 , rank deficiency = 1 , for 19 Xs.
Variable to drop: Max_Temperature_DegC
Adjusted R² = 0.9876240962812989 , max(X p-value) = 0.4933067673185476 , rank deficiency = 0 , for 18 Xs.
Variable to drop: Max_Temperature_DegC_sqrt
Adjusted R² = 0.9876326291645541 , max(X p-value) = 0.28380461479496333 , rank deficiency = 0 , for 17 Xs.
Variable to drop: cumVirusTests
Adjusted R² = 0.9875847905050127 , max(X p-value) = 0.38145099224074586 , rank deficiency = 0 , for 16 Xs.
Variable to drop: US_Covid_Cases
Adjusted R² = 0.9875745583040226 , max(X p-value) = 0.3033212061119781 , rank deficiency = 0 , for 15 Xs.
Variable to drop: cumAdmissions
Adjusted R² = 0.9875351150834836 , max(X p-value) = 0.5257591380669193 , rank deficiency = 0 , for 14 Xs.
Variable to drop: Unnamed__0_sqrt
Adjusted R² = 0.9875308732800098 , max(X p-value) = 0.3648158501903497 , rank deficiency = 0 , for 13 Xs.
Variable to drop: US_Covid_Deaths_sqar
Adjusted R² = 0.9875019210130457 , max(X p-value) = 0.16374274349453577 , rank deficiency = 0 , for 12 Xs.
Variable to drop: Rainfall_mm
Adjusted R² = 0.9874039959044038 , max(X p-value) = 0.5576411141784767 , rank deficiency = 0 , for 11 Xs.
Variable to drop: Rainfall_mm_sqrt
Adjusted R² = 0.9874202447904344 , max(X p-value) = 0.20767158697013655 , rank deficiency = 0 , for 10 Xs.
Variable to drop: Sun_Hours
Adjusted R² = 0.9873377355065756 , max(X p-value) = 0.3938301858101325 , rank deficiency = 0 , for 9 Xs.
Variable to drop: Sun_Hours_sqrt
Adjusted R² = 0.9873158955181962 , max(X p-value) = 0.09938954607017733 , rank deficiency = 0 , for 8 Xs.
Variable to drop: FTSE_Volume_sqrt
Adjusted R² = 0.9871768927494204 , max(X p-value) = 0.006567843687697745 , rank deficiency = 0 , for 7 Xs.
Variable to drop: cumCasesByPublishDate_sqrt
Adjusted R² = 0.9866763723054723 , max(X p-value) = 0.31105525372983445 , rank deficiency = 0 , for 6 Xs.
Variable to drop: cumCasesByPublishDate
Adjusted R² = 0.9866243171203116 , max(X p-value) = 0.006301790845440342 , rank deficiency = 0 , for 5 Xs.
Variable to drop: cumAdmissions_sqar
Adjusted R² = 0.9862011814199727 , max(X p-value) = 7.47767222068124e-06 , rank deficiency = 0 , for 4 Xs.
Variable to drop: FTSE_Volume
Adjusted R² = 0.9848629590711335 , max(X p-value) = 2.7958428529628562e-14 , rank deficiency = 0 , for 3 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate
Adjusted R² = 0.9810501495095536 , max(X p-value) = 1.4534743615873629e-30 , rank deficiency = 0 , for 2 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqrt
Adjusted R² = 0.9719822109943469 , max(X p-value) = 0.0 , rank deficiency = 0 , for 1 Xs.
Variable left: FTSE_High_Price
Restarting from best model (with 7 Xs & Adjusted R² = 0.9871768927494204) found so far...
Adding 6 2-way interactions among 4 untransformed variables in best model found so far:
FTSE_High_Price_x_cumCasesByPublishDate
FTSE_High_Price_x_FTSE_Volume
FTSE_High_Price_x_cumDailyNsoDeathsByDeathDate
cumCasesByPublishDate_x_FTSE_Volume
cumCasesByPublishDate_x_cumDailyNsoDeathsByDeathDate
FTSE_Volume_x_cumDailyNsoDeathsByDeathDate
X pairs with correlations > 0.995 :
(no more)
Adjusted R² = 0.5745356068548819 , max(X p-value) = 7.817706738219301e-05 , rank deficiency = 3 , for 13 Xs.
Variable to drop: cumCasesByPublishDate_x_cumDailyNsoDeathsByDeathDate
Adjusted R² = 0.5749559219287119 , max(X p-value) = 1.0408222920504333e-65 , rank deficiency = 3 , for 12 Xs.
Variable to drop: cumCasesByPublishDate_sqrt
Adjusted R² = 0.569873238331733 , max(X p-value) = 2.620534561040993e-171 , rank deficiency = 2 , for 11 Xs.
Variable to drop: FTSE_High_Price_x_cumCasesByPublishDate
Adjusted R² = 0.5598060785900254 , max(X p-value) = 0.0 , rank deficiency = 2 , for 10 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate
Adjusted R² = 0.5259083068656136 , max(X p-value) = 0.0 , rank deficiency = 2 , for 9 Xs.
Variable to drop: cumCasesByPublishDate
Adjusted R² = 0.4474734727923152 , max(X p-value) = 0.0 , rank deficiency = 2 , for 8 Xs.
Variable to drop: cumCasesByPublishDate_x_FTSE_Volume
Adjusted R² = 0.4272420157718778 , max(X p-value) = 0.0 , rank deficiency = 1 , for 7 Xs.
Variable to drop: FTSE_Volume_x_cumDailyNsoDeathsByDeathDate
Adjusted R² = 0.9868093753009473 , max(X p-value) = 0.6279299018662838 , rank deficiency = 1 , for 6 Xs.
Variable to drop: FTSE_High_Price_x_FTSE_Volume
Adjusted R² = 0.9867866124430192 , max(X p-value) = 0.0022554822471929597 , rank deficiency = 0 , for 5 Xs.
Variable to drop: cumAdmissions_sqar
Adjusted R² = 0.9862084965026295 , max(X p-value) = 4.103700537822749e-05 , rank deficiency = 0 , for 4 Xs.
Variable to drop: FTSE_Volume
Adjusted R² = 0.9850980210197775 , max(X p-value) = 5.960604637603362e-15 , rank deficiency = 0 , for 3 Xs.
Variable to drop: FTSE_High_Price_x_cumDailyNsoDeathsByDeathDate
Adjusted R² = 0.9810501495095536 , max(X p-value) = 1.4534743615873629e-30 , rank deficiency = 0 , for 2 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqrt
Adjusted R² = 0.9719822109943469 , max(X p-value) = 0.0 , rank deficiency = 0 , for 1 Xs.
Variable left: FTSE_High_Price
Best model has 7 Xs (Adjusted R² = 0.9871768927494204 , rank deficiency = 0):
Results: Generalized linear model
==================================================================================
Model: GLM AIC: 3915.0172
Link Function: log BIC: -1826.0650
Dependent Variable: FTSE_Close_Price Log-Likelihood: -1949.5
Date: 2021-04-22 13:46 LL-Null: -8829.2
No. Observations: 351 Deviance: 184.18
Df Model: 7 Pearson chi2: 184.
Df Residuals: 343 Scale: 1.0000
Method: IRLS
----------------------------------------------------------------------------------
Coef. Std.Err. z P>|z| [0.025 0.975]
----------------------------------------------------------------------------------
const 7.6419 0.0200 382.9768 0.0000 7.6028 7.6810
FTSE_High_Price 0.0002 0.0000 64.1542 0.0000 0.0002 0.0002
cumCasesByPublishDate -0.0000 0.0000 -2.8695 0.0041 -0.0000 -0.0000
FTSE_Volume -0.0000 0.0000 -4.3899 0.0000 -0.0000 -0.0000
cumAdmissions_sqar 0.0000 0.0000 3.7947 0.0001 0.0000 0.0000
cumCasesByPublishDate_sqrt 0.0001 0.0000 2.7180 0.0066 0.0000 0.0001
cumDailyNsoDeathsByDeathDate -0.0000 0.0000 -4.5962 0.0000 -0.0000 -0.0000
cumDailyNsoDeathsByDeathDate_sqrt 0.0004 0.0001 4.7742 0.0000 0.0002 0.0005
==================================================================================
Descending order of 7 X's significance, assuming Poisson error distribution:
Coefficient z-stat
const 7.641901e+00 382.976791
FTSE_High_Price 1.705967e-04 64.154223
cumDailyNsoDeathsByDeathDate_sqrt 3.668396e-04 4.774246
cumDailyNsoDeathsByDeathDate -1.618704e-06 -4.596236
FTSE_Volume -9.380393e-12 -4.389919
cumAdmissions_sqar 8.347297e-13 3.794675
cumCasesByPublishDate -3.948092e-08 -2.869460
cumCasesByPublishDate_sqrt 6.066188e-05 2.717997
Rank deficiency = 0: Df Model (7) is same as number of Xs (7).
Partial Leverage (or Partial Regression, or Added-Variable) diagnostic plots for fit:
Poisson fit using reproducible random 80% (x_train & y_train) of data rows:
Mean Absolute Residual = 39.820736739086556
Root Mean Squared Residual = 56.29580283329556
R² = 0.9874333548944321
Poisson prediction using remaining 20% (x_test & y_test) of data rows:
Mean Absolute Error = 38.13094713264823
Root Mean Squared Error = 49.21581604415208
R² = 0.9899814663214543
Plots of train-set fit & test-set predict:
========================================================== Next GLM Model ==============================================================
counter = 3
GLM Model Fitted = Logit
1 entered.
Assuming Logit error distribution.
X pairs with correlations > 0.995 :
SPY_Close_Price , SPY_Open_Price
1 variables considered for deletion:
SPY_Close_Price
X pairs with correlations > 0.995 :
(no more)
1 variables deleted.
X pairs with correlations > 0.995 :
FTSE_Open_Price_sqar , FTSE_Open_Price_sqrt
FTSE_Open_Price_sqrt , FTSE_Open_Price
FTSE_High_Price_sqar , FTSE_High_Price
FTSE_High_Price , FTSE_High_Price_sqrt
FTSE_Low_Price_sqrt , FTSE_Low_Price_sqar
FTSE_Low_Price_sqar , FTSE_Low_Price
6 variables considered for deletion:
FTSE_Open_Price_sqar
FTSE_Open_Price_sqrt
FTSE_High_Price_sqar
FTSE_High_Price_sqrt
FTSE_Low_Price_sqrt
FTSE_Low_Price_sqar
X pairs with correlations > 0.995 :
(no more)
6 transformed variables deleted.
Fit using reproducible random 80% (x_train & y_train) of data rows:
OLS fit including only 17 untransformed Xs:
Results: Ordinary least squares
============================================================================================
Model: OLS Adj. R-squared: 0.993
Dependent Variable: FTSE_Close_Price AIC: 3643.1592
Date: 2021-04-22 13:46 BIC: 3712.6534
No. Observations: 351 Log-Likelihood: -1803.6
Df Model: 17 F-statistic: 2885.
Df Residuals: 333 Prob (F-statistic): 0.00
R-squared: 0.993 Scale: 1792.9
--------------------------------------------------------------------------------------------
Coef. Std.Err. t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept 231.5217 111.9089 2.0688 0.0393 11.3842 451.6592
FTSE_Low_Price 0.7842 0.0457 17.1672 0.0000 0.6943 0.8740
FTSE_High_Price 0.6129 0.0549 11.1584 0.0000 0.5048 0.7209
FTSE_Open_Price -0.3757 0.0363 -10.3364 0.0000 -0.4472 -0.3042
SPY_Open_Price -0.6321 0.3893 -1.6237 0.1054 -1.3978 0.1337
cumCasesByPublishDate -0.0001 0.0000 -1.2514 0.2117 -0.0002 0.0000
FTSE_Volume -0.0000 0.0000 -2.5064 0.0127 -0.0000 -0.0000
US_Covid_Cases -0.0000 0.0000 -1.1749 0.2409 -0.0000 0.0000
cumVirusTests 0.0000 0.0000 2.0727 0.0390 0.0000 0.0000
US_Covid_Deaths -0.0003 0.0005 -0.5383 0.5907 -0.0013 0.0008
cumAdmissions 0.0018 0.0012 1.4977 0.1351 -0.0006 0.0042
cumPeopleVaccinatedCompleteByPublishDate -0.0000 0.0000 -2.0560 0.0406 -0.0000 -0.0000
cumDailyNsoDeathsByDeathDate -0.0026 0.0027 -0.9650 0.3352 -0.0078 0.0027
Unnamed__0 0.1010 0.2194 0.4602 0.6457 -0.3307 0.5326
Sun_Hours 0.4980 0.3462 1.4383 0.1513 -0.1831 1.1791
Max_Temperature_DegC -27.0565 12.0258 -2.2499 0.0251 -50.7126 -3.4004
Rainfall_mm -0.5798 0.5001 -1.1594 0.2471 -1.5635 0.4039
Min_Temperature_DegC 30.7883 12.9053 2.3857 0.0176 5.4022 56.1745
--------------------------------------------------------------------------------------------
Omnibus: 41.833 Durbin-Watson: 2.000
Prob(Omnibus): 0.000 Jarque-Bera (JB): 248.624
Skew: 0.195 Prob(JB): 0.000
Kurtosis: 7.105 Condition No.: 49810797143
============================================================================================
* The condition number is large (5e+10). This might indicate strong
multicollinearity or other numerical problems.
Descending order of 17 X's significance, assuming Normal error distribution:
FTSE_Low_Price
FTSE_High_Price
FTSE_Open_Price
FTSE_Volume
Min_Temperature_DegC
Max_Temperature_DegC
cumVirusTests
cumPeopleVaccinatedCompleteByPublishDate
SPY_Open_Price
cumAdmissions
Sun_Hours
cumCasesByPublishDate
US_Covid_Cases
Rainfall_mm
cumDailyNsoDeathsByDeathDate
US_Covid_Deaths
Unnamed__0
Rank deficiency = 0: Df Model (17) is same as number of Xs (17).
Logit fit including transformed Xs:
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 35 , for 45 Xs.
Variable to drop: Sun_Hours_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 34 , for 44 Xs.
Variable to drop: cumCasesByPublishDate
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 33 , for 43 Xs.
Variable to drop: FTSE_Volume
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 33 , for 42 Xs.
Variable to drop: US_Covid_Deaths_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 33 , for 41 Xs.
Variable to drop: cumCasesByPublishDate_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 33 , for 40 Xs.
Variable to drop: cumVirusTests
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 33 , for 39 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 33 , for 38 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 32 , for 37 Xs.
Variable to drop: cumCasesByPublishDate_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 31 , for 36 Xs.
Variable to drop: Unnamed__0_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 30 , for 35 Xs.
Variable to drop: cumVirusTests_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 29 , for 34 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 29 , for 33 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 29 , for 32 Xs.
Variable to drop: cumAdmissions_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 29 , for 31 Xs.
Variable to drop: cumAdmissions
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 28 , for 30 Xs.
Variable to drop: US_Covid_Cases
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 27 , for 29 Xs.
Variable to drop: US_Covid_Cases_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 27 , for 28 Xs.
Variable to drop: FTSE_Volume_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 27 Xs.
Variable to drop: US_Covid_Cases_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 26 Xs.
Variable to drop: cumAdmissions_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 25 Xs.
Variable to drop: Unnamed__0_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 24 Xs.
Variable to drop: US_Covid_Deaths
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 23 Xs.
Variable to drop: Unnamed__0
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 22 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 21 Xs.
Variable to drop: Min_Temperature_DegC
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 20 Xs.
Variable to drop: FTSE_Volume_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 19 Xs.
Variable to drop: Min_Temperature_DegC_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 18 Xs.
Variable to drop: Max_Temperature_DegC_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 17 Xs.
Variable to drop: Min_Temperature_DegC_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 16 Xs.
Variable to drop: Max_Temperature_DegC_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 15 Xs.
Variable to drop: Max_Temperature_DegC
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 14 Xs.
Variable to drop: Rainfall_mm_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 13 Xs.
Variable to drop: Rainfall_mm
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 12 Xs.
Variable to drop: Rainfall_mm_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 11 Xs.
Variable to drop: Sun_Hours
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 10 Xs.
Variable to drop: Sun_Hours_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 9 Xs.
Variable to drop: cumVirusTests_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 8 Xs.
Variable to drop: US_Covid_Deaths_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 7 Xs.
Variable to drop: SPY_Open_Price_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 6 Xs.
Variable to drop: SPY_Open_Price
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 5 Xs.
Variable to drop: SPY_Open_Price_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 4 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 3 Xs.
Variable to drop: FTSE_Open_Price
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 2 Xs.
Variable to drop: FTSE_High_Price
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 1 Xs.
Variable left: FTSE_Low_Price
Restarting from best model (with 27 Xs & Adjusted R² = -inf) found so far...
Adding 45 2-way interactions among 10 untransformed variables in best model found so far:
FTSE_Low_Price_x_FTSE_High_Price
FTSE_Low_Price_x_FTSE_Open_Price
FTSE_Low_Price_x_SPY_Open_Price
FTSE_Low_Price_x_US_Covid_Deaths
FTSE_Low_Price_x_Unnamed__0
FTSE_Low_Price_x_Sun_Hours
FTSE_Low_Price_x_Max_Temperature_DegC
FTSE_Low_Price_x_Rainfall_mm
FTSE_Low_Price_x_Min_Temperature_DegC
FTSE_High_Price_x_FTSE_Open_Price
FTSE_High_Price_x_SPY_Open_Price
FTSE_High_Price_x_US_Covid_Deaths
FTSE_High_Price_x_Unnamed__0
FTSE_High_Price_x_Sun_Hours
FTSE_High_Price_x_Max_Temperature_DegC
FTSE_High_Price_x_Rainfall_mm
FTSE_High_Price_x_Min_Temperature_DegC
FTSE_Open_Price_x_SPY_Open_Price
FTSE_Open_Price_x_US_Covid_Deaths
FTSE_Open_Price_x_Unnamed__0
FTSE_Open_Price_x_Sun_Hours
FTSE_Open_Price_x_Max_Temperature_DegC
FTSE_Open_Price_x_Rainfall_mm
FTSE_Open_Price_x_Min_Temperature_DegC
SPY_Open_Price_x_US_Covid_Deaths
SPY_Open_Price_x_Unnamed__0
SPY_Open_Price_x_Sun_Hours
SPY_Open_Price_x_Max_Temperature_DegC
SPY_Open_Price_x_Rainfall_mm
SPY_Open_Price_x_Min_Temperature_DegC
US_Covid_Deaths_x_Unnamed__0
US_Covid_Deaths_x_Sun_Hours
US_Covid_Deaths_x_Max_Temperature_DegC
US_Covid_Deaths_x_Rainfall_mm
US_Covid_Deaths_x_Min_Temperature_DegC
Unnamed__0_x_Sun_Hours
Unnamed__0_x_Max_Temperature_DegC
Unnamed__0_x_Rainfall_mm
Unnamed__0_x_Min_Temperature_DegC
Sun_Hours_x_Max_Temperature_DegC
Sun_Hours_x_Rainfall_mm
Sun_Hours_x_Min_Temperature_DegC
Max_Temperature_DegC_x_Rainfall_mm
Max_Temperature_DegC_x_Min_Temperature_DegC
Rainfall_mm_x_Min_Temperature_DegC
X pairs with correlations > 0.995 :
FTSE_High_Price_x_Unnamed__0 , FTSE_Open_Price_x_Unnamed__0
FTSE_Open_Price_x_Unnamed__0 , FTSE_Low_Price_x_Unnamed__0
US_Covid_Cases_sqrt , SPY_Open_Price_x_Unnamed__0
US_Covid_Deaths_x_Min_Temperature_DegC , US_Covid_Deaths_x_Sun_Hours
US_Covid_Deaths_x_Sun_Hours , US_Covid_Deaths_x_Max_Temperature_DegC
FTSE_High_Price_x_US_Covid_Deaths , FTSE_Open_Price_x_US_Covid_Deaths
FTSE_Open_Price_x_US_Covid_Deaths , FTSE_Low_Price_x_US_Covid_Deaths
FTSE_Low_Price_x_US_Covid_Deaths , SPY_Open_Price_x_US_Covid_Deaths
FTSE_Low_Price_x_SPY_Open_Price , FTSE_Open_Price_x_SPY_Open_Price
FTSE_Open_Price , FTSE_High_Price_x_FTSE_Open_Price
FTSE_Low_Price_x_FTSE_Open_Price , FTSE_Low_Price_x_FTSE_High_Price
11 variables considered for deletion:
FTSE_High_Price_x_Unnamed__0
FTSE_Open_Price_x_Unnamed__0
SPY_Open_Price_x_Unnamed__0
US_Covid_Deaths_x_Min_Temperature_DegC
US_Covid_Deaths_x_Max_Temperature_DegC
FTSE_High_Price_x_US_Covid_Deaths
FTSE_Open_Price_x_US_Covid_Deaths
FTSE_Low_Price_x_US_Covid_Deaths
FTSE_Open_Price_x_SPY_Open_Price
FTSE_High_Price_x_FTSE_Open_Price
FTSE_Low_Price_x_FTSE_Open_Price
X pairs with correlations > 0.995 :
FTSE_Low_Price_x_SPY_Open_Price , FTSE_High_Price_x_SPY_Open_Price
FTSE_High_Price , FTSE_Low_Price_x_FTSE_High_Price
2 variables considered for deletion:
FTSE_High_Price_x_SPY_Open_Price
FTSE_Low_Price_x_FTSE_High_Price
X pairs with correlations > 0.995 :
(no more)
13 interaction variables deleted.
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 28 , for 59 Xs.
Variable to drop: cumAdmissions_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 28 , for 58 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 28 , for 57 Xs.
Variable to drop: FTSE_Low_Price_x_SPY_Open_Price
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 28 , for 56 Xs.
Variable to drop: US_Covid_Cases_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 28 , for 55 Xs.
Variable to drop: cumVirusTests_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 28 , for 54 Xs.
Variable to drop: Unnamed__0_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 28 , for 53 Xs.
Variable to drop: FTSE_Volume_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 28 , for 52 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 28 , for 51 Xs.
Variable to drop: SPY_Open_Price_x_US_Covid_Deaths
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 28 , for 50 Xs.
Variable to drop: Unnamed__0_x_Sun_Hours
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 27 , for 49 Xs.
Variable to drop: FTSE_High_Price_x_Sun_Hours
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 26 , for 48 Xs.
Variable to drop: FTSE_Open_Price_x_Max_Temperature_DegC
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 25 , for 47 Xs.
Variable to drop: SPY_Open_Price_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 25 , for 46 Xs.
Variable to drop: FTSE_Open_Price_x_Min_Temperature_DegC
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 24 , for 45 Xs.
Variable to drop: US_Covid_Deaths
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 23 , for 44 Xs.
Variable to drop: Min_Temperature_DegC_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 22 , for 43 Xs.
Variable to drop: FTSE_Low_Price_x_Max_Temperature_DegC
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 21 , for 42 Xs.
Variable to drop: FTSE_High_Price_x_Min_Temperature_DegC
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 20 , for 41 Xs.
Variable to drop: Max_Temperature_DegC_x_Min_Temperature_DegC
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 19 , for 40 Xs.
Variable to drop: Max_Temperature_DegC_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 18 , for 39 Xs.
Variable to drop: Unnamed__0
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 17 , for 38 Xs.
Variable to drop: Sun_Hours_x_Min_Temperature_DegC
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 16 , for 37 Xs.
Variable to drop: Rainfall_mm_x_Min_Temperature_DegC
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 15 , for 36 Xs.
Variable to drop: Max_Temperature_DegC_x_Rainfall_mm
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 14 , for 35 Xs.
Variable to drop: Min_Temperature_DegC_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 13 , for 34 Xs.
Variable to drop: SPY_Open_Price
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 12 , for 33 Xs.
Variable to drop: SPY_Open_Price_x_Min_Temperature_DegC
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 11 , for 32 Xs.
Variable to drop: Unnamed__0_x_Min_Temperature_DegC
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 10 , for 31 Xs.
Variable to drop: Sun_Hours_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 9 , for 30 Xs.
Variable to drop: Sun_Hours
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 8 , for 29 Xs.
Variable to drop: Max_Temperature_DegC
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 7 , for 28 Xs.
Variable to drop: Min_Temperature_DegC
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 6 , for 27 Xs.
Variable to drop: SPY_Open_Price_x_Max_Temperature_DegC
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 5 , for 26 Xs.
Variable to drop: Rainfall_mm_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 4 , for 25 Xs.
Variable to drop: Max_Temperature_DegC_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 3 , for 24 Xs.
Variable to drop: FTSE_High_Price_x_Rainfall_mm
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 2 , for 23 Xs.
Variable to drop: FTSE_High_Price
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 1 , for 22 Xs.
Variable to drop: FTSE_Open_Price_x_Rainfall_mm
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 21 Xs.
Variable to drop: FTSE_Low_Price
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 20 Xs.
Variable to drop: Rainfall_mm
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 19 Xs.
Variable to drop: US_Covid_Deaths_x_Unnamed__0
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 18 Xs.
Variable to drop: US_Covid_Deaths_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 17 Xs.
Variable to drop: FTSE_Low_Price_x_Unnamed__0
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 16 Xs.
Variable to drop: SPY_Open_Price_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 15 Xs.
Variable to drop: Rainfall_mm_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 14 Xs.
Variable to drop: Sun_Hours_x_Max_Temperature_DegC
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 13 Xs.
Variable to drop: Sun_Hours_x_Rainfall_mm
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 12 Xs.
Variable to drop: Unnamed__0_x_Rainfall_mm
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 11 Xs.
Variable to drop: Unnamed__0_x_Max_Temperature_DegC
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 10 Xs.
Variable to drop: FTSE_Low_Price_x_Rainfall_mm
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 9 Xs.
Variable to drop: SPY_Open_Price_x_Rainfall_mm
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 8 Xs.
Variable to drop: US_Covid_Deaths_x_Rainfall_mm
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 7 Xs.
Variable to drop: US_Covid_Deaths_x_Sun_Hours
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 6 Xs.
Variable to drop: SPY_Open_Price_x_Sun_Hours
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 5 Xs.
Variable to drop: FTSE_High_Price_x_Max_Temperature_DegC
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 4 Xs.
Variable to drop: FTSE_Low_Price_x_Min_Temperature_DegC
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 3 Xs.
Variable to drop: FTSE_Open_Price_x_Sun_Hours
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 2 Xs.
Variable to drop: FTSE_Low_Price_x_Sun_Hours
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 1 Xs.
Variable left: FTSE_Open_Price
Best model has 21 Xs (Adjusted R² = -inf , rank deficiency = 0):
Results: Generalized linear model
=============================================================================================================================================================
Model: GLM AIC: nan
Link Function: logit BIC: 198184550.8735
Dependent Variable: FTSE_Close_Price Log-Likelihood: nan
Date: 2021-04-22 13:46 LL-Null: nan
No. Observations: 351 Deviance: 1.9819e+08
Df Model: 21 Pearson chi2: 6.32e+25
Df Residuals: 329 Scale: 1.0000
Method: IRLS
-------------------------------------------------------------------------------------------------------------------------------------------------------------
Coef. Std.Err. z P>|z| [0.025 0.975]
-------------------------------------------------------------------------------------------------------------------------------------------------------------
const -396553779158712320.0000 2920687222.9826 -135774134.2648 0.0000 -396553784883154112.0000 -396553773434270528.0000
FTSE_Low_Price -22443684580540416.0000 1870860.3257 -11996451190.1955 0.0000 -22443684584207236.0000 -22443684576873596.0000
FTSE_Open_Price 52295855338522816.0000 1592573.0305 32837335769.3493 0.0000 52295855335401432.0000 52295855341644200.0000
FTSE_High_Price_x_Max_Temperature_DegC 325923046343490.7500 6353.8040 51295734776.6307 0.0000 325923046331037.5000 325923046355944.0000
FTSE_Low_Price_x_Rainfall_mm -365813923118099.0000 16730.1864 -21865501968.6400 0.0000 -365813923150889.5625 -365813923085308.4375
SPY_Open_Price_sqrt -3655400242191155200.0000 275087613.1772 -13288131006.5977 0.0000 -3655400242730316800.0000 -3655400241651993600.0000
FTSE_Low_Price_x_Min_Temperature_DegC -727738913658670.0000 22333.9976 -32584355344.6081 0.0000 -727738913702443.8750 -727738913614896.1250
SPY_Open_Price_x_Rainfall_mm 6403891227581240.0000 203821.8818 31419056531.2728 0.0000 6403891227181756.0000 6403891227980724.0000
FTSE_Low_Price_x_Sun_Hours 445767841978949.5000 13182.0782 33816203619.8827 0.0000 445767841953113.1250 445767842004785.8750
FTSE_Open_Price_x_Sun_Hours -407292251878286.5000 11957.6778 -34061149554.7584 0.0000 -407292251901723.1250 -407292251854849.8750
SPY_Open_Price_x_Sun_Hours -2889375588642639.0000 98866.9327 -29224893608.9409 0.0000 -2889375588836414.5000 -2889375588448863.5000
US_Covid_Deaths_x_Unnamed__0 15859383544.6015 1.1568 13709131840.0783 0.0000 15859383542.3341 15859383546.8689
US_Covid_Deaths_x_Rainfall_mm -2769930949167.0547 88.0005 -31476301275.0312 0.0000 -2769930949339.5327 -2769930948994.5767
US_Covid_Deaths_x_Sun_Hours 1469243584637.4883 50.0806 29337550983.8899 0.0000 1469243584539.3320 1469243584735.6445
FTSE_Low_Price_x_Unnamed__0 -2837209987445.1289 153.0530 -18537440321.1849 0.0000 -2837209987745.1074 -2837209987145.1504
Unnamed__0_x_Rainfall_mm 2017536529377992.0000 75250.9598 26810774719.7731 0.0000 2017536529230502.7500 2017536529525481.2500
Unnamed__0_x_Max_Temperature_DegC -9655253596446320.0000 385460.4726 -25048621795.3288 0.0000 -9655253597201808.0000 -9655253595690832.0000
US_Covid_Deaths_sqrt 5135798668675872.0000 276920.3373 18546123118.0340 0.0000 5135798668133118.0000 5135798669218626.0000
Sun_Hours_x_Max_Temperature_DegC 17264802043283936.0000 711064.4422 24280221340.0228 0.0000 17264802041890276.0000 17264802044677596.0000
Rainfall_mm -673410396162842624.0000 43878201.7926 -15347265126.0082 0.0000 -673410396248842368.0000 -673410396076842880.0000
Rainfall_mm_sqar 3027754398718784.0000 169487.6296 17864161563.1344 0.0000 3027754398386594.5000 3027754399050973.5000
Sun_Hours_x_Rainfall_mm 3018022516016296.0000 138520.3386 21787576806.4056 0.0000 3018022515744801.0000 3018022516287791.0000
=============================================================================================================================================================
Descending order of 21 X's significance, assuming Logit error distribution:
Coefficient z-stat
const -3.965538e+17 -1.357741e+08
FTSE_High_Price_x_Max_Temperature_DegC 3.259230e+14 5.129573e+10
FTSE_Open_Price_x_Sun_Hours -4.072923e+14 -3.406115e+10
FTSE_Low_Price_x_Sun_Hours 4.457678e+14 3.381620e+10
FTSE_Open_Price 5.229586e+16 3.283734e+10
FTSE_Low_Price_x_Min_Temperature_DegC -7.277389e+14 -3.258436e+10
US_Covid_Deaths_x_Rainfall_mm -2.769931e+12 -3.147630e+10
SPY_Open_Price_x_Rainfall_mm 6.403891e+15 3.141906e+10
US_Covid_Deaths_x_Sun_Hours 1.469244e+12 2.933755e+10
SPY_Open_Price_x_Sun_Hours -2.889376e+15 -2.922489e+10
Unnamed__0_x_Rainfall_mm 2.017537e+15 2.681077e+10
Unnamed__0_x_Max_Temperature_DegC -9.655254e+15 -2.504862e+10
Sun_Hours_x_Max_Temperature_DegC 1.726480e+16 2.428022e+10
FTSE_Low_Price_x_Rainfall_mm -3.658139e+14 -2.186550e+10
Sun_Hours_x_Rainfall_mm 3.018023e+15 2.178758e+10
US_Covid_Deaths_sqrt 5.135799e+15 1.854612e+10
FTSE_Low_Price_x_Unnamed__0 -2.837210e+12 -1.853744e+10
Rainfall_mm_sqar 3.027754e+15 1.786416e+10
Rainfall_mm -6.734104e+17 -1.534727e+10
US_Covid_Deaths_x_Unnamed__0 1.585938e+10 1.370913e+10
SPY_Open_Price_sqrt -3.655400e+18 -1.328813e+10
FTSE_Low_Price -2.244368e+16 -1.199645e+10
Rank deficiency = 0: Df Model (21) is same as number of Xs (21).
Partial Leverage (or Partial Regression, or Added-Variable) diagnostic plots for fit:
Logit fit using reproducible random 80% (x_train & y_train) of data rows:
Mean Absolute Residual = 6302.139515669511
Root Mean Squared Residual = 6322.116350393164
R² = nan
Logit prediction using remaining 20% (x_test & y_test) of data rows:
Mean Absolute Error = 6223.950000000001
Root Mean Squared Error = 6243.163068116265
R² = nan
Plots of train-set fit & test-set predict:
========================================================== Next GLM Model ==============================================================
counter = 4
GLM Model Fitted = Probit
1 entered.
Assuming Probit error distribution.
X pairs with correlations > 0.995 :
SPY_Close_Price , SPY_Open_Price
1 variables considered for deletion:
SPY_Close_Price
X pairs with correlations > 0.995 :
(no more)
1 variables deleted.
X pairs with correlations > 0.995 :
FTSE_Open_Price_sqar , FTSE_Open_Price_sqrt
FTSE_Open_Price_sqrt , FTSE_Open_Price
FTSE_High_Price_sqar , FTSE_High_Price
FTSE_High_Price , FTSE_High_Price_sqrt
FTSE_Low_Price_sqrt , FTSE_Low_Price_sqar
FTSE_Low_Price_sqar , FTSE_Low_Price
6 variables considered for deletion:
FTSE_Open_Price_sqar
FTSE_Open_Price_sqrt
FTSE_High_Price_sqar
FTSE_High_Price_sqrt
FTSE_Low_Price_sqrt
FTSE_Low_Price_sqar
X pairs with correlations > 0.995 :
(no more)
6 transformed variables deleted.
Fit using reproducible random 80% (x_train & y_train) of data rows:
OLS fit including only 17 untransformed Xs:
Results: Ordinary least squares
============================================================================================
Model: OLS Adj. R-squared: 0.993
Dependent Variable: FTSE_Close_Price AIC: 3643.1592
Date: 2021-04-22 13:46 BIC: 3712.6534
No. Observations: 351 Log-Likelihood: -1803.6
Df Model: 17 F-statistic: 2885.
Df Residuals: 333 Prob (F-statistic): 0.00
R-squared: 0.993 Scale: 1792.9
--------------------------------------------------------------------------------------------
Coef. Std.Err. t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept 231.5217 111.9089 2.0688 0.0393 11.3842 451.6592
FTSE_Low_Price 0.7842 0.0457 17.1672 0.0000 0.6943 0.8740
FTSE_High_Price 0.6129 0.0549 11.1584 0.0000 0.5048 0.7209
FTSE_Open_Price -0.3757 0.0363 -10.3364 0.0000 -0.4472 -0.3042
SPY_Open_Price -0.6321 0.3893 -1.6237 0.1054 -1.3978 0.1337
cumCasesByPublishDate -0.0001 0.0000 -1.2514 0.2117 -0.0002 0.0000
FTSE_Volume -0.0000 0.0000 -2.5064 0.0127 -0.0000 -0.0000
US_Covid_Cases -0.0000 0.0000 -1.1749 0.2409 -0.0000 0.0000
cumVirusTests 0.0000 0.0000 2.0727 0.0390 0.0000 0.0000
US_Covid_Deaths -0.0003 0.0005 -0.5383 0.5907 -0.0013 0.0008
cumAdmissions 0.0018 0.0012 1.4977 0.1351 -0.0006 0.0042
cumPeopleVaccinatedCompleteByPublishDate -0.0000 0.0000 -2.0560 0.0406 -0.0000 -0.0000
cumDailyNsoDeathsByDeathDate -0.0026 0.0027 -0.9650 0.3352 -0.0078 0.0027
Unnamed__0 0.1010 0.2194 0.4602 0.6457 -0.3307 0.5326
Sun_Hours 0.4980 0.3462 1.4383 0.1513 -0.1831 1.1791
Max_Temperature_DegC -27.0565 12.0258 -2.2499 0.0251 -50.7126 -3.4004
Rainfall_mm -0.5798 0.5001 -1.1594 0.2471 -1.5635 0.4039
Min_Temperature_DegC 30.7883 12.9053 2.3857 0.0176 5.4022 56.1745
--------------------------------------------------------------------------------------------
Omnibus: 41.833 Durbin-Watson: 2.000
Prob(Omnibus): 0.000 Jarque-Bera (JB): 248.624
Skew: 0.195 Prob(JB): 0.000
Kurtosis: 7.105 Condition No.: 49810797143
============================================================================================
* The condition number is large (5e+10). This might indicate strong
multicollinearity or other numerical problems.
Descending order of 17 X's significance, assuming Normal error distribution:
FTSE_Low_Price
FTSE_High_Price
FTSE_Open_Price
FTSE_Volume
Min_Temperature_DegC
Max_Temperature_DegC
cumVirusTests
cumPeopleVaccinatedCompleteByPublishDate
SPY_Open_Price
cumAdmissions
Sun_Hours
cumCasesByPublishDate
US_Covid_Cases
Rainfall_mm
cumDailyNsoDeathsByDeathDate
US_Covid_Deaths
Unnamed__0
Rank deficiency = 0: Df Model (17) is same as number of Xs (17).
Probit fit including transformed Xs:
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 35 , for 45 Xs.
Variable to drop: cumVirusTests_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 34 , for 44 Xs.
Variable to drop: Sun_Hours_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 33 , for 43 Xs.
Variable to drop: FTSE_Volume
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 33 , for 42 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 32 , for 41 Xs.
Variable to drop: cumCasesByPublishDate_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 32 , for 40 Xs.
Variable to drop: cumVirusTests
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 32 , for 39 Xs.
Variable to drop: cumAdmissions
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 31 , for 38 Xs.
Variable to drop: US_Covid_Deaths_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 31 , for 37 Xs.
Variable to drop: US_Covid_Deaths
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 30 , for 36 Xs.
Variable to drop: Unnamed__0_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 29 , for 35 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 29 , for 34 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 28 , for 33 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 28 , for 32 Xs.
Variable to drop: cumVirusTests_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 27 , for 31 Xs.
Variable to drop: cumCasesByPublishDate
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 27 , for 30 Xs.
Variable to drop: cumAdmissions_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 27 , for 29 Xs.
Variable to drop: FTSE_Volume_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 17 , for 28 Xs.
Variable to drop: Max_Temperature_DegC
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 16 , for 27 Xs.
Variable to drop: Max_Temperature_DegC_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 15 , for 26 Xs.
Variable to drop: SPY_Open_Price_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 15 , for 25 Xs.
Variable to drop: SPY_Open_Price_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 14 , for 24 Xs.
Variable to drop: Rainfall_mm_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 13 , for 23 Xs.
Variable to drop: Rainfall_mm
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 12 , for 22 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 12 , for 21 Xs.
Variable to drop: Unnamed__0_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 11 , for 20 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 10 , for 19 Xs.
Variable to drop: cumAdmissions_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 9 , for 18 Xs.
Variable to drop: Sun_Hours
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 8 , for 17 Xs.
Variable to drop: Unnamed__0
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 7 , for 16 Xs.
Variable to drop: SPY_Open_Price
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 6 , for 15 Xs.
Variable to drop: FTSE_Volume_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 6 , for 14 Xs.
Variable to drop: Sun_Hours_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 5 , for 13 Xs.
Variable to drop: Min_Temperature_DegC_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 5 , for 12 Xs.
Variable to drop: Min_Temperature_DegC
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 4 , for 11 Xs.
Variable to drop: Max_Temperature_DegC_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 3 , for 10 Xs.
Variable to drop: Rainfall_mm_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 3 , for 9 Xs.
Variable to drop: cumCasesByPublishDate_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 3 , for 8 Xs.
Variable to drop: US_Covid_Deaths_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 2 , for 7 Xs.
Variable to drop: US_Covid_Cases_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 6 Xs.
Variable to drop: US_Covid_Cases_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 5 Xs.
Variable to drop: US_Covid_Cases
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 4 Xs.
Variable to drop: Min_Temperature_DegC_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 3 Xs.
Variable to drop: FTSE_Open_Price
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 2 Xs.
Variable to drop: FTSE_High_Price
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 1 Xs.
Variable left: FTSE_Low_Price
Restarting from best model (with 6 Xs & Adjusted R² = -inf) found so far...
Adding 6 2-way interactions among 4 untransformed variables in best model found so far:
FTSE_Low_Price_x_FTSE_High_Price
FTSE_Low_Price_x_FTSE_Open_Price
FTSE_Low_Price_x_US_Covid_Cases
FTSE_High_Price_x_FTSE_Open_Price
FTSE_High_Price_x_US_Covid_Cases
FTSE_Open_Price_x_US_Covid_Cases
X pairs with correlations > 0.995 :
FTSE_Open_Price_x_US_Covid_Cases , FTSE_High_Price_x_US_Covid_Cases
FTSE_High_Price_x_US_Covid_Cases , FTSE_Low_Price_x_US_Covid_Cases
FTSE_Open_Price , FTSE_High_Price_x_FTSE_Open_Price
FTSE_Low_Price_x_FTSE_Open_Price , FTSE_Low_Price_x_FTSE_High_Price
4 variables considered for deletion:
FTSE_Open_Price_x_US_Covid_Cases
FTSE_High_Price_x_US_Covid_Cases
FTSE_High_Price_x_FTSE_Open_Price
FTSE_Low_Price_x_FTSE_Open_Price
X pairs with correlations > 0.995 :
FTSE_High_Price , FTSE_Low_Price_x_FTSE_High_Price
1 variables considered for deletion:
FTSE_Low_Price_x_FTSE_High_Price
X pairs with correlations > 0.995 :
(no more)
5 interaction variables deleted.
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 7 Xs.
Variable to drop: FTSE_Low_Price_x_US_Covid_Cases
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 6 Xs.
Variable to drop: US_Covid_Cases_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 5 Xs.
Variable to drop: US_Covid_Cases
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 4 Xs.
Variable to drop: Min_Temperature_DegC_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 3 Xs.
Variable to drop: FTSE_Open_Price
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 2 Xs.
Variable to drop: FTSE_High_Price
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 1 Xs.
Variable left: FTSE_Low_Price
Best model has 7 Xs (Adjusted R² = -inf , rank deficiency = 0):
Results: Generalized linear model
===========================================================================================================================================
Model: GLM AIC: nan
Link Function: probit BIC: 198184468.8225
Dependent Variable: FTSE_Close_Price Log-Likelihood: nan
Date: 2021-04-22 13:46 LL-Null: nan
No. Observations: 351 Deviance: 1.9819e+08
Df Model: 7 Pearson chi2: 6.32e+25
Df Residuals: 343 Scale: 1.0000
Method: IRLS
-------------------------------------------------------------------------------------------------------------------------------------------
Coef. Std.Err. z P>|z| [0.025 0.975]
-------------------------------------------------------------------------------------------------------------------------------------------
const 39262322297915904.0000 12221803.1098 3212481983.6506 0.0000 39262322273961608.0000 39262322321870200.0000
FTSE_Low_Price 692734622538850.2500 7206.9953 96119755069.9408 0.0000 692734622524724.7500 692734622552975.7500
FTSE_High_Price 419944836245586.2500 9195.8778 45666639498.6726 0.0000 419944836227562.6875 419944836263609.8125
FTSE_Open_Price -302446720770904.5000 6883.8228 -43935866991.1720 0.0000 -302446720784396.5625 -302446720757412.4375
FTSE_Low_Price_x_US_Covid_Cases -243693.2909 0.0002 -1039055571.7220 0.0000 -243693.2913 -243693.2904
US_Covid_Cases 2303083953.9685 1.6918 1361349300.4185 0.0000 2303083950.6527 2303083957.2843
US_Covid_Cases_sqrt -3625205786817.3750 1446.1234 -2506844021.2338 0.0000 -3625205789651.7246 -3625205783983.0254
Min_Temperature_DegC_sqrt 13528909892940032.0000 3196993.6433 4231760022.8560 0.0000 13528909886674040.0000 13528909899206024.0000
===========================================================================================================================================
Descending order of 7 X's significance, assuming Probit error distribution:
Coefficient z-stat
const 3.926232e+16 3.212482e+09
FTSE_Low_Price 6.927346e+14 9.611976e+10
FTSE_High_Price 4.199448e+14 4.566664e+10
FTSE_Open_Price -3.024467e+14 -4.393587e+10
Min_Temperature_DegC_sqrt 1.352891e+16 4.231760e+09
US_Covid_Cases_sqrt -3.625206e+12 -2.506844e+09
US_Covid_Cases 2.303084e+09 1.361349e+09
FTSE_Low_Price_x_US_Covid_Cases -2.436933e+05 -1.039056e+09
Rank deficiency = 0: Df Model (7) is same as number of Xs (7).
Partial Leverage (or Partial Regression, or Added-Variable) diagnostic plots for fit:
Probit fit using reproducible random 80% (x_train & y_train) of data rows:
Mean Absolute Residual = 6302.139515669511
Root Mean Squared Residual = 6322.116350393164
R² = nan
Probit prediction using remaining 20% (x_test & y_test) of data rows:
Mean Absolute Error = 6223.950000000001
Root Mean Squared Error = 6243.163068116265
R² = nan
Plots of train-set fit & test-set predict:
========================================================== Next GLM Model ==============================================================
counter = 5
GLM Model Fitted = Gamma
1 entered.
Assuming Gamma error distribution.
X pairs with correlations > 0.995 :
SPY_Close_Price , SPY_Open_Price
1 variables considered for deletion:
SPY_Close_Price
X pairs with correlations > 0.995 :
(no more)
1 variables deleted.
X pairs with correlations > 0.995 :
FTSE_Open_Price_sqar , FTSE_Open_Price_sqrt
FTSE_Open_Price_sqrt , FTSE_Open_Price
FTSE_High_Price_sqar , FTSE_High_Price
FTSE_High_Price , FTSE_High_Price_sqrt
FTSE_Low_Price_sqrt , FTSE_Low_Price_sqar
FTSE_Low_Price_sqar , FTSE_Low_Price
6 variables considered for deletion:
FTSE_Open_Price_sqar
FTSE_Open_Price_sqrt
FTSE_High_Price_sqar
FTSE_High_Price_sqrt
FTSE_Low_Price_sqrt
FTSE_Low_Price_sqar
X pairs with correlations > 0.995 :
(no more)
6 transformed variables deleted.
Fit using reproducible random 80% (x_train & y_train) of data rows:
OLS fit including only 17 untransformed Xs:
Results: Ordinary least squares
============================================================================================
Model: OLS Adj. R-squared: 0.993
Dependent Variable: FTSE_Close_Price AIC: 3643.1592
Date: 2021-04-22 13:46 BIC: 3712.6534
No. Observations: 351 Log-Likelihood: -1803.6
Df Model: 17 F-statistic: 2885.
Df Residuals: 333 Prob (F-statistic): 0.00
R-squared: 0.993 Scale: 1792.9
--------------------------------------------------------------------------------------------
Coef. Std.Err. t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept 231.5217 111.9089 2.0688 0.0393 11.3842 451.6592
FTSE_Low_Price 0.7842 0.0457 17.1672 0.0000 0.6943 0.8740
FTSE_High_Price 0.6129 0.0549 11.1584 0.0000 0.5048 0.7209
FTSE_Open_Price -0.3757 0.0363 -10.3364 0.0000 -0.4472 -0.3042
SPY_Open_Price -0.6321 0.3893 -1.6237 0.1054 -1.3978 0.1337
cumCasesByPublishDate -0.0001 0.0000 -1.2514 0.2117 -0.0002 0.0000
FTSE_Volume -0.0000 0.0000 -2.5064 0.0127 -0.0000 -0.0000
US_Covid_Cases -0.0000 0.0000 -1.1749 0.2409 -0.0000 0.0000
cumVirusTests 0.0000 0.0000 2.0727 0.0390 0.0000 0.0000
US_Covid_Deaths -0.0003 0.0005 -0.5383 0.5907 -0.0013 0.0008
cumAdmissions 0.0018 0.0012 1.4977 0.1351 -0.0006 0.0042
cumPeopleVaccinatedCompleteByPublishDate -0.0000 0.0000 -2.0560 0.0406 -0.0000 -0.0000
cumDailyNsoDeathsByDeathDate -0.0026 0.0027 -0.9650 0.3352 -0.0078 0.0027
Unnamed__0 0.1010 0.2194 0.4602 0.6457 -0.3307 0.5326
Sun_Hours 0.4980 0.3462 1.4383 0.1513 -0.1831 1.1791
Max_Temperature_DegC -27.0565 12.0258 -2.2499 0.0251 -50.7126 -3.4004
Rainfall_mm -0.5798 0.5001 -1.1594 0.2471 -1.5635 0.4039
Min_Temperature_DegC 30.7883 12.9053 2.3857 0.0176 5.4022 56.1745
--------------------------------------------------------------------------------------------
Omnibus: 41.833 Durbin-Watson: 2.000
Prob(Omnibus): 0.000 Jarque-Bera (JB): 248.624
Skew: 0.195 Prob(JB): 0.000
Kurtosis: 7.105 Condition No.: 49810797143
============================================================================================
* The condition number is large (5e+10). This might indicate strong
multicollinearity or other numerical problems.
Descending order of 17 X's significance, assuming Normal error distribution:
FTSE_Low_Price
FTSE_High_Price
FTSE_Open_Price
FTSE_Volume
Min_Temperature_DegC
Max_Temperature_DegC
cumVirusTests
cumPeopleVaccinatedCompleteByPublishDate
SPY_Open_Price
cumAdmissions
Sun_Hours
cumCasesByPublishDate
US_Covid_Cases
Rainfall_mm
cumDailyNsoDeathsByDeathDate
US_Covid_Deaths
Unnamed__0
Rank deficiency = 0: Df Model (17) is same as number of Xs (17).
Gamma fit including transformed Xs:
Adjusted R² = 0.37321232663135384 , max(X p-value) = 0.8227771111117455 , rank deficiency = 35 , for 45 Xs.
Variable to drop: US_Covid_Cases
Adjusted R² = 0.3817027267569263 , max(X p-value) = 0.9133223016974729 , rank deficiency = 35 , for 44 Xs.
Variable to drop: FTSE_Volume_sqar
Adjusted R² = -inf , max(X p-value) = 0.9583513955445756 , rank deficiency = 22 , for 43 Xs.
Variable to drop: cumCasesByPublishDate_sqar
Adjusted R² = -inf , max(X p-value) = 0.8670178283439143 , rank deficiency = 22 , for 42 Xs.
Variable to drop: cumVirusTests_sqrt
Adjusted R² = -inf , max(X p-value) = 0.8721015004742216 , rank deficiency = 21 , for 41 Xs.
Variable to drop: Max_Temperature_DegC_sqar
Adjusted R² = -inf , max(X p-value) = 0.9463371630053747 , rank deficiency = 20 , for 40 Xs.
Variable to drop: Min_Temperature_DegC
Adjusted R² = -inf , max(X p-value) = 0.9697565468850384 , rank deficiency = 19 , for 39 Xs.
Variable to drop: Min_Temperature_DegC_sqrt
Adjusted R² = -inf , max(X p-value) = 0.9007115035626743 , rank deficiency = 18 , for 38 Xs.
Variable to drop: Max_Temperature_DegC_sqrt
Adjusted R² = -inf , max(X p-value) = 0.7590218113874654 , rank deficiency = 17 , for 37 Xs.
Variable to drop: Max_Temperature_DegC
Adjusted R² = -inf , max(X p-value) = 0.529756653727293 , rank deficiency = 16 , for 36 Xs.
Variable to drop: Min_Temperature_DegC_sqar
Adjusted R² = 0.7963428545627439 , max(X p-value) = 0.9600274603729214 , rank deficiency = 18 , for 35 Xs.
Variable to drop: FTSE_Open_Price
Adjusted R² = 0.7799575267144208 , max(X p-value) = 0.8558550342137923 , rank deficiency = 17 , for 34 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqar
Adjusted R² = -inf , max(X p-value) = 0.8552128167056167 , rank deficiency = 14 , for 33 Xs.
Variable to drop: US_Covid_Cases_sqrt
Adjusted R² = -inf , max(X p-value) = 0.046767388006549754 , rank deficiency = 14 , for 32 Xs.
Variable to drop: Sun_Hours_sqar
Adjusted R² = -0.03859663928642432 , max(X p-value) = 0.8764440284264785 , rank deficiency = 16 , for 31 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqrt
Adjusted R² = 0.8360396927307613 , max(X p-value) = 1.2802138067736917e-09 , rank deficiency = 15 , for 30 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate
Adjusted R² = 0.8338434503965254 , max(X p-value) = 2.9071576493793694e-06 , rank deficiency = 15 , for 29 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqar
Adjusted R² = -0.03719598880857977 , max(X p-value) = 0.8868276780690109 , rank deficiency = 15 , for 28 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqrt
Adjusted R² = 0.8388542113722706 , max(X p-value) = 9.41352971204273e-07 , rank deficiency = 14 , for 27 Xs.
Variable to drop: US_Covid_Deaths_sqrt
Adjusted R² = 0.8360698531773686 , max(X p-value) = 3.3059114449237743e-16 , rank deficiency = 13 , for 26 Xs.
Variable to drop: Rainfall_mm_sqar
Adjusted R² = 0.8601927025947954 , max(X p-value) = 5.5455617738810264e-12 , rank deficiency = 13 , for 25 Xs.
Variable to drop: FTSE_Volume
Adjusted R² = 0.85856516707022 , max(X p-value) = 0.00041249711766288796 , rank deficiency = 12 , for 24 Xs.
Variable to drop: FTSE_Volume_sqrt
Adjusted R² = 0.857946826284467 , max(X p-value) = 1.5989497894868645e-20 , rank deficiency = 12 , for 23 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate
Adjusted R² = 0.8479144852584422 , max(X p-value) = 8.346100661708606e-19 , rank deficiency = 12 , for 22 Xs.
Variable to drop: Unnamed__0_sqar
Adjusted R² = 0.8391890273711808 , max(X p-value) = 7.050780378790794e-09 , rank deficiency = 12 , for 21 Xs.
Variable to drop: cumVirusTests_sqar
Adjusted R² = 0.9599453439015769 , max(X p-value) = 0.9537680921989193 , rank deficiency = 11 , for 20 Xs.
Variable to drop: cumAdmissions
Adjusted R² = 0.9599536068334957 , max(X p-value) = 0.28556533593804123 , rank deficiency = 10 , for 19 Xs.
Variable to drop: US_Covid_Deaths_sqar
Adjusted R² = 0.960050414842225 , max(X p-value) = 0.09222365313644462 , rank deficiency = 10 , for 18 Xs.
Variable to drop: Unnamed__0_sqrt
Adjusted R² = 0.9600470577853288 , max(X p-value) = 0.0021745127640017117 , rank deficiency = 9 , for 17 Xs.
Variable to drop: Rainfall_mm
Adjusted R² = 0.959372209820619 , max(X p-value) = 0.024706459334142634 , rank deficiency = 8 , for 16 Xs.
Variable to drop: Sun_Hours
Adjusted R² = 0.9592490639627856 , max(X p-value) = 0.9976293599041667 , rank deficiency = 7 , for 15 Xs.
Variable to drop: Rainfall_mm_sqrt
Adjusted R² = 0.9593139649926342 , max(X p-value) = 0.004285264635151392 , rank deficiency = 6 , for 14 Xs.
Variable to drop: Sun_Hours_sqrt
Adjusted R² = 0.9584822348893663 , max(X p-value) = 0.0003867048412889778 , rank deficiency = 5 , for 13 Xs.
Variable to drop: cumAdmissions_sqar
Adjusted R² = 0.9570626721514651 , max(X p-value) = 0.0005222867572889243 , rank deficiency = 5 , for 12 Xs.
Variable to drop: cumVirusTests
Adjusted R² = 0.9568812932281126 , max(X p-value) = 9.861069633035358e-06 , rank deficiency = 5 , for 11 Xs.
Variable to drop: cumCasesByPublishDate
Adjusted R² = 0.9560791699715184 , max(X p-value) = 2.3020587925336513e-07 , rank deficiency = 4 , for 10 Xs.
Variable to drop: cumCasesByPublishDate_sqrt
Adjusted R² = 0.9551793982050498 , max(X p-value) = 2.299080491391885e-18 , rank deficiency = 4 , for 9 Xs.
Variable to drop: FTSE_High_Price
Adjusted R² = 0.9524665296938745 , max(X p-value) = 3.2433369144503537e-16 , rank deficiency = 4 , for 8 Xs.
Variable to drop: cumAdmissions_sqrt
Adjusted R² = 0.9507350047935229 , max(X p-value) = 9.101084699753965e-80 , rank deficiency = 4 , for 7 Xs.
Variable to drop: Unnamed__0
Adjusted R² = 0.9372277535749304 , max(X p-value) = 4.679134657472113e-75 , rank deficiency = 3 , for 6 Xs.
Variable to drop: US_Covid_Deaths
Adjusted R² = 0.9249785232689327 , max(X p-value) = 0.0 , rank deficiency = 3 , for 5 Xs.
Variable to drop: US_Covid_Cases_sqar
Adjusted R² = 0.986301698390417 , max(X p-value) = 2.928891195115542e-10 , rank deficiency = 0 , for 4 Xs.
Variable to drop: SPY_Open_Price_sqar
Adjusted R² = 0.9845402063723713 , max(X p-value) = 0.006025613235026866 , rank deficiency = 0 , for 3 Xs.
Variable to drop: SPY_Open_Price
Adjusted R² = 0.9843804185379033 , max(X p-value) = 3.852643308568676e-12 , rank deficiency = 0 , for 2 Xs.
Variable to drop: SPY_Open_Price_sqrt
Adjusted R² = 0.9816081749919106 , max(X p-value) = 0.0 , rank deficiency = 0 , for 1 Xs.
Variable left: FTSE_Low_Price
Restarting from best model (with 4 Xs & Adjusted R² = 0.986301698390417) found so far...
Adding 1 2-way interactions among 2 untransformed variables in best model found so far:
FTSE_Low_Price_x_SPY_Open_Price
X pairs with correlations > 0.995 :
(no more)
Adjusted R² = 0.9870583265480315 , max(X p-value) = 3.8432282533188306e-05 , rank deficiency = 0 , for 5 Xs.
Variable to drop: FTSE_Low_Price_x_SPY_Open_Price
Adjusted R² = 0.986301698390417 , max(X p-value) = 2.928891195115542e-10 , rank deficiency = 0 , for 4 Xs.
Variable to drop: SPY_Open_Price_sqar
Adjusted R² = 0.9845402063723713 , max(X p-value) = 0.006025613235026866 , rank deficiency = 0 , for 3 Xs.
Variable to drop: SPY_Open_Price
Adjusted R² = 0.9843804185379033 , max(X p-value) = 3.852643308568676e-12 , rank deficiency = 0 , for 2 Xs.
Variable to drop: SPY_Open_Price_sqrt
Adjusted R² = 0.9816081749919106 , max(X p-value) = 0.0 , rank deficiency = 0 , for 1 Xs.
Variable left: FTSE_Low_Price
Best model has 5 Xs (Adjusted R² = 0.9870583265480315 , rank deficiency = 0):
Results: Generalized linear model
===============================================================================
Model: GLM AIC: 3871.5662
Link Function: inverse_power BIC: -2021.9402
Dependent Variable: FTSE_Close_Price Log-Likelihood: -1929.8
Date: 2021-04-22 13:47 LL-Null: -13935.
No. Observations: 351 Deviance: 0.031031
Df Model: 5 Pearson chi2: 0.0312
Df Residuals: 345 Scale: 9.0471e-05
Method: IRLS
-------------------------------------------------------------------------------
Coef. Std.Err. z P>|z| [0.025 0.975]
-------------------------------------------------------------------------------
const 0.0014 0.0002 5.7724 0.0000 0.0009 0.0019
FTSE_Low_Price -0.0000 0.0000 -9.0306 0.0000 -0.0000 -0.0000
FTSE_Low_Price_x_SPY_Open_Price 0.0000 0.0000 4.1167 0.0000 0.0000 0.0000
SPY_Open_Price_sqrt -0.0002 0.0000 -4.1647 0.0000 -0.0002 -0.0001
SPY_Open_Price 0.0000 0.0000 4.1300 0.0000 0.0000 0.0000
SPY_Open_Price_sqar -0.0000 0.0000 -5.0120 0.0000 -0.0000 -0.0000
===============================================================================
Descending order of 5 X's significance, assuming Gamma error distribution:
Coefficient z-stat
const 1.419225e-03 5.772386
FTSE_Low_Price -4.134587e-08 -9.030638
SPY_Open_Price_sqar -3.867328e-09 -5.012043
SPY_Open_Price_sqrt -1.568142e-04 -4.164724
SPY_Open_Price 6.521268e-06 4.130024
FTSE_Low_Price_x_SPY_Open_Price 5.656823e-11 4.116708
Rank deficiency = 0: Df Model (5) is same as number of Xs (5).
Partial Leverage (or Partial Regression, or Added-Variable) diagnostic plots for fit:
Gamma fit using reproducible random 80% (x_train & y_train) of data rows:
Mean Absolute Residual = 43.918281573122556
Root Mean Squared Residual = 56.73677274645222
R² = 0.9872432075973453
Gamma prediction using remaining 20% (x_test & y_test) of data rows:
Mean Absolute Error = 42.0210908171442
Root Mean Squared Error = 55.371810541037576
R² = 0.9875464090867117
Plots of train-set fit & test-set predict:
========================================================== Next GLM Model ==============================================================
counter = 6
GLM Model Fitted = InverseGaussian
1 entered.
Assuming InverseGaussian error distribution.
X pairs with correlations > 0.995 :
SPY_Close_Price , SPY_Open_Price
1 variables considered for deletion:
SPY_Close_Price
X pairs with correlations > 0.995 :
(no more)
1 variables deleted.
X pairs with correlations > 0.995 :
FTSE_Open_Price_sqar , FTSE_Open_Price_sqrt
FTSE_Open_Price_sqrt , FTSE_Open_Price
FTSE_High_Price_sqar , FTSE_High_Price
FTSE_High_Price , FTSE_High_Price_sqrt
FTSE_Low_Price_sqrt , FTSE_Low_Price_sqar
FTSE_Low_Price_sqar , FTSE_Low_Price
6 variables considered for deletion:
FTSE_Open_Price_sqar
FTSE_Open_Price_sqrt
FTSE_High_Price_sqar
FTSE_High_Price_sqrt
FTSE_Low_Price_sqrt
FTSE_Low_Price_sqar
X pairs with correlations > 0.995 :
(no more)
6 transformed variables deleted.
Fit using reproducible random 80% (x_train & y_train) of data rows:
OLS fit including only 17 untransformed Xs:
Results: Ordinary least squares
============================================================================================
Model: OLS Adj. R-squared: 0.993
Dependent Variable: FTSE_Close_Price AIC: 3643.1592
Date: 2021-04-22 13:47 BIC: 3712.6534
No. Observations: 351 Log-Likelihood: -1803.6
Df Model: 17 F-statistic: 2885.
Df Residuals: 333 Prob (F-statistic): 0.00
R-squared: 0.993 Scale: 1792.9
--------------------------------------------------------------------------------------------
Coef. Std.Err. t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept 231.5217 111.9089 2.0688 0.0393 11.3842 451.6592
FTSE_Low_Price 0.7842 0.0457 17.1672 0.0000 0.6943 0.8740
FTSE_High_Price 0.6129 0.0549 11.1584 0.0000 0.5048 0.7209
FTSE_Open_Price -0.3757 0.0363 -10.3364 0.0000 -0.4472 -0.3042
SPY_Open_Price -0.6321 0.3893 -1.6237 0.1054 -1.3978 0.1337
cumCasesByPublishDate -0.0001 0.0000 -1.2514 0.2117 -0.0002 0.0000
FTSE_Volume -0.0000 0.0000 -2.5064 0.0127 -0.0000 -0.0000
US_Covid_Cases -0.0000 0.0000 -1.1749 0.2409 -0.0000 0.0000
cumVirusTests 0.0000 0.0000 2.0727 0.0390 0.0000 0.0000
US_Covid_Deaths -0.0003 0.0005 -0.5383 0.5907 -0.0013 0.0008
cumAdmissions 0.0018 0.0012 1.4977 0.1351 -0.0006 0.0042
cumPeopleVaccinatedCompleteByPublishDate -0.0000 0.0000 -2.0560 0.0406 -0.0000 -0.0000
cumDailyNsoDeathsByDeathDate -0.0026 0.0027 -0.9650 0.3352 -0.0078 0.0027
Unnamed__0 0.1010 0.2194 0.4602 0.6457 -0.3307 0.5326
Sun_Hours 0.4980 0.3462 1.4383 0.1513 -0.1831 1.1791
Max_Temperature_DegC -27.0565 12.0258 -2.2499 0.0251 -50.7126 -3.4004
Rainfall_mm -0.5798 0.5001 -1.1594 0.2471 -1.5635 0.4039
Min_Temperature_DegC 30.7883 12.9053 2.3857 0.0176 5.4022 56.1745
--------------------------------------------------------------------------------------------
Omnibus: 41.833 Durbin-Watson: 2.000
Prob(Omnibus): 0.000 Jarque-Bera (JB): 248.624
Skew: 0.195 Prob(JB): 0.000
Kurtosis: 7.105 Condition No.: 49810797143
============================================================================================
* The condition number is large (5e+10). This might indicate strong
multicollinearity or other numerical problems.
Descending order of 17 X's significance, assuming Normal error distribution:
FTSE_Low_Price
FTSE_High_Price
FTSE_Open_Price
FTSE_Volume
Min_Temperature_DegC
Max_Temperature_DegC
cumVirusTests
cumPeopleVaccinatedCompleteByPublishDate
SPY_Open_Price
cumAdmissions
Sun_Hours
cumCasesByPublishDate
US_Covid_Cases
Rainfall_mm
cumDailyNsoDeathsByDeathDate
US_Covid_Deaths
Unnamed__0
Rank deficiency = 0: Df Model (17) is same as number of Xs (17).
InverseGaussian fit including transformed Xs:
Adjusted R² = 0.6269262364830994 , max(X p-value) = 0.8684214866107944 , rank deficiency = 35 , for 45 Xs.
Variable to drop: FTSE_Volume_sqar
Adjusted R² = -inf , max(X p-value) = 0.9435650977947586 , rank deficiency = 23 , for 44 Xs.
Variable to drop: cumCasesByPublishDate
Adjusted R² = -inf , max(X p-value) = 0.6832221614340215 , rank deficiency = 22 , for 43 Xs.
Variable to drop: Min_Temperature_DegC_sqar
Adjusted R² = -inf , max(X p-value) = 0.7962776698926084 , rank deficiency = 21 , for 42 Xs.
Variable to drop: Max_Temperature_DegC_sqar
Adjusted R² = -inf , max(X p-value) = 0.2663136800184301 , rank deficiency = 20 , for 41 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqar
Adjusted R² = -inf , max(X p-value) = 0.255114455064999 , rank deficiency = 19 , for 40 Xs.
Variable to drop: cumVirusTests
Adjusted R² = -inf , max(X p-value) = 0.09929160125135307 , rank deficiency = 19 , for 39 Xs.
Variable to drop: FTSE_Low_Price
Adjusted R² = -inf , max(X p-value) = 0.014768107967592944 , rank deficiency = 18 , for 38 Xs.
Variable to drop: Unnamed__0_sqar
Adjusted R² = -inf , max(X p-value) = 0.06155461955697827 , rank deficiency = 18 , for 37 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate
Adjusted R² = -inf , max(X p-value) = 0.03179291556782809 , rank deficiency = 18 , for 36 Xs.
Variable to drop: cumVirusTests_sqar
Adjusted R² = 0.9162080138226437 , max(X p-value) = 0.9553218426459781 , rank deficiency = 18 , for 35 Xs.
Variable to drop: US_Covid_Deaths
Adjusted R² = 0.7756182148097922 , max(X p-value) = 0.7954895687684047 , rank deficiency = 18 , for 34 Xs.
Variable to drop: FTSE_Open_Price
Adjusted R² = 0.9480570810066657 , max(X p-value) = 0.97693485968716 , rank deficiency = 18 , for 33 Xs.
Variable to drop: Min_Temperature_DegC_sqrt
Adjusted R² = 0.9055954561440502 , max(X p-value) = 0.9758754707363995 , rank deficiency = 17 , for 32 Xs.
Variable to drop: cumAdmissions_sqrt
Adjusted R² = 0.9435960199386255 , max(X p-value) = 0.9179620566792952 , rank deficiency = 16 , for 31 Xs.
Variable to drop: Max_Temperature_DegC
Adjusted R² = -inf , max(X p-value) = 0.9930546511445221 , rank deficiency = 10 , for 30 Xs.
Variable to drop: Unnamed__0_sqrt
Adjusted R² = -inf , max(X p-value) = 0.8770591157062435 , rank deficiency = 9 , for 29 Xs.
Variable to drop: FTSE_Volume_sqrt
Adjusted R² = -inf , max(X p-value) = 0.9741361196938104 , rank deficiency = 9 , for 28 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqrt
Adjusted R² = 0.9652692067378577 , max(X p-value) = 0.7177030995699706 , rank deficiency = 13 , for 27 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate
Adjusted R² = 0.9653342491762675 , max(X p-value) = 0.1628269602814031 , rank deficiency = 13 , for 26 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqar
Adjusted R² = 0.965420068909231 , max(X p-value) = 0.12881319571755126 , rank deficiency = 13 , for 25 Xs.
Variable to drop: cumVirusTests_sqrt
Adjusted R² = 0.9652666729803271 , max(X p-value) = 0.21042782458193476 , rank deficiency = 13 , for 24 Xs.
Variable to drop: Unnamed__0
Adjusted R² = 0.9651559061994462 , max(X p-value) = 0.19456472803393587 , rank deficiency = 12 , for 23 Xs.
Variable to drop: US_Covid_Deaths_sqrt
Adjusted R² = 0.9651670797868716 , max(X p-value) = 0.1505071823006346 , rank deficiency = 11 , for 22 Xs.
Variable to drop: Sun_Hours
Adjusted R² = 0.9649906267828705 , max(X p-value) = 0.458670713942413 , rank deficiency = 10 , for 21 Xs.
Variable to drop: Min_Temperature_DegC
Adjusted R² = 0.9649611961492371 , max(X p-value) = 0.11787277690399688 , rank deficiency = 9 , for 20 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqrt
Adjusted R² = 0.9648985867806816 , max(X p-value) = 0.00971556497859933 , rank deficiency = 8 , for 19 Xs.
Variable to drop: cumCasesByPublishDate_sqar
Adjusted R² = 0.9640767786533951 , max(X p-value) = 0.06734088040760255 , rank deficiency = 7 , for 18 Xs.
Variable to drop: US_Covid_Cases
Adjusted R² = 0.9634740343354187 , max(X p-value) = 0.0006946929376341227 , rank deficiency = 7 , for 17 Xs.
Variable to drop: US_Covid_Deaths_sqar
Adjusted R² = 0.9626638281611359 , max(X p-value) = 0.0018351166943530923 , rank deficiency = 7 , for 16 Xs.
Variable to drop: US_Covid_Cases_sqar
Adjusted R² = 0.9840954192858459 , max(X p-value) = 0.6398126408364406 , rank deficiency = 1 , for 15 Xs.
Variable to drop: cumCasesByPublishDate_sqrt
Adjusted R² = 0.9841468353955174 , max(X p-value) = 0.4961556821735509 , rank deficiency = 1 , for 14 Xs.
Variable to drop: Rainfall_mm_sqar
Adjusted R² = 0.9841412514280568 , max(X p-value) = 0.3312313706164757 , rank deficiency = 1 , for 13 Xs.
Variable to drop: Max_Temperature_DegC_sqrt
Adjusted R² = 0.984153044187247 , max(X p-value) = 0.11764188542972664 , rank deficiency = 1 , for 12 Xs.
Variable to drop: Rainfall_mm
Adjusted R² = 0.9840779808922845 , max(X p-value) = 0.6575428837855261 , rank deficiency = 1 , for 11 Xs.
Variable to drop: Rainfall_mm_sqrt
Adjusted R² = 0.9841243802378388 , max(X p-value) = 0.061002134099372 , rank deficiency = 1 , for 10 Xs.
Variable to drop: Sun_Hours_sqrt
Adjusted R² = 0.9839498993367828 , max(X p-value) = 0.5491301709353267 , rank deficiency = 1 , for 9 Xs.
Variable to drop: Sun_Hours_sqar
Adjusted R² = 0.9839725430692731 , max(X p-value) = 0.012674931543563783 , rank deficiency = 1 , for 8 Xs.
Variable to drop: US_Covid_Cases_sqrt
Adjusted R² = 0.9837067831758726 , max(X p-value) = 0.00044647633896881044 , rank deficiency = 1 , for 7 Xs.
Variable to drop: FTSE_Volume
Adjusted R² = 0.9834786050875161 , max(X p-value) = 9.145686996157706e-07 , rank deficiency = 1 , for 6 Xs.
Variable to drop: SPY_Open_Price_sqar
Adjusted R² = 0.9819349675426327 , max(X p-value) = 1.861617650274361e-07 , rank deficiency = 1 , for 5 Xs.
Variable to drop: cumAdmissions_sqar
Adjusted R² = 0.9806159353612207 , max(X p-value) = 1.0053002240165849e-18 , rank deficiency = 0 , for 4 Xs.
Variable to drop: cumAdmissions
Adjusted R² = 0.9765668480968436 , max(X p-value) = 1.9112754826503526e-18 , rank deficiency = 0 , for 3 Xs.
Variable to drop: SPY_Open_Price
Adjusted R² = 0.9726009271776132 , max(X p-value) = 1.4740903256652516e-57 , rank deficiency = 0 , for 2 Xs.
Variable to drop: SPY_Open_Price_sqrt
Adjusted R² = 0.9475994623344095 , max(X p-value) = 0.0 , rank deficiency = 0 , for 1 Xs.
Variable left: FTSE_High_Price
Restarting from best model (with 4 Xs & Adjusted R² = 0.9806159353612207) found so far...
Adding 3 2-way interactions among 3 untransformed variables in best model found so far:
FTSE_High_Price_x_SPY_Open_Price
FTSE_High_Price_x_cumAdmissions
SPY_Open_Price_x_cumAdmissions
X pairs with correlations > 0.995 :
(no more)
Adjusted R² = 0.9816747606563158 , max(X p-value) = 0.8652318407770181 , rank deficiency = 0 , for 7 Xs.
Variable to drop: SPY_Open_Price
Adjusted R² = 0.9817236408787345 , max(X p-value) = 0.14556865828039667 , rank deficiency = 0 , for 6 Xs.
Variable to drop: cumAdmissions
Adjusted R² = 0.9817787549424257 , max(X p-value) = 8.160597116494757e-13 , rank deficiency = 0 , for 5 Xs.
Variable to drop: SPY_Open_Price_x_cumAdmissions
Adjusted R² = 0.9791165963211118 , max(X p-value) = 1.858100319301856e-13 , rank deficiency = 0 , for 4 Xs.
Variable to drop: FTSE_High_Price_x_cumAdmissions
Adjusted R² = 0.9763758280597937 , max(X p-value) = 8.283185248463431e-17 , rank deficiency = 0 , for 3 Xs.
Variable to drop: FTSE_High_Price_x_SPY_Open_Price
Adjusted R² = 0.9726009271776132 , max(X p-value) = 1.4740903256652516e-57 , rank deficiency = 0 , for 2 Xs.
Variable to drop: SPY_Open_Price_sqrt
Adjusted R² = 0.9475994623344095 , max(X p-value) = 0.0 , rank deficiency = 0 , for 1 Xs.
Variable left: FTSE_High_Price
Best model has 5 Xs (Adjusted R² = 0.9817787549424257 , rank deficiency = 0):
Results: Generalized linear model
=================================================================================
Model: GLM AIC: 4011.8036
Link Function: inverse_squared BIC: -2021.9712
Dependent Variable: FTSE_Close_Price Log-Likelihood: -1999.9
Date: 2021-04-22 13:47 LL-Null: -10128.
No. Observations: 351 Deviance: 7.3660e-06
Df Model: 5 Pearson chi2: 7.25e-06
Df Residuals: 345 Scale: 2.1027e-08
Method: IRLS
---------------------------------------------------------------------------------
Coef. Std.Err. z P>|z| [0.025 0.975]
---------------------------------------------------------------------------------
const 0.0000 0.0000 19.1815 0.0000 0.0000 0.0000
FTSE_High_Price -0.0000 0.0000 -17.5067 0.0000 -0.0000 -0.0000
FTSE_High_Price_x_SPY_Open_Price 0.0000 0.0000 8.3248 0.0000 0.0000 0.0000
SPY_Open_Price_sqrt -0.0000 0.0000 -10.3984 0.0000 -0.0000 -0.0000
SPY_Open_Price_x_cumAdmissions 0.0000 0.0000 7.1584 0.0000 0.0000 0.0000
FTSE_High_Price_x_cumAdmissions -0.0000 0.0000 -8.0676 0.0000 -0.0000 -0.0000
=================================================================================
Descending order of 5 X's significance, assuming InverseGaussian error distribution:
Coefficient z-stat
const 1.525653e-07 19.181515
FTSE_High_Price -1.311390e-11 -17.506696
SPY_Open_Price_sqrt -4.463141e-09 -10.398376
FTSE_High_Price_x_SPY_Open_Price 1.812011e-14 8.324806
FTSE_High_Price_x_cumAdmissions -6.094218e-18 -8.067574
SPY_Open_Price_x_cumAdmissions 9.755069e-17 7.158430
Rank deficiency = 0: Df Model (5) is same as number of Xs (5).
Partial Leverage (or Partial Regression, or Added-Variable) diagnostic plots for fit:
InverseGaussian fit using reproducible random 80% (x_train & y_train) of data rows:
Mean Absolute Residual = 50.255959989669996
Root Mean Squared Residual = 67.30867198529681
R² = 0.9820390584432482
InverseGaussian prediction using remaining 20% (x_test & y_test) of data rows:
Mean Absolute Error = 42.84020035436302
Root Mean Squared Error = 55.431801899440806
R² = 0.9873544647627023
Plots of train-set fit & test-set predict:
========================================================== Next GLM Model ==============================================================
counter = 7
GLM Model Fitted = NegativeBinomial
1 entered.
Assuming NegativeBinomial error distribution.
X pairs with correlations > 0.995 :
SPY_Close_Price , SPY_Open_Price
1 variables considered for deletion:
SPY_Close_Price
X pairs with correlations > 0.995 :
(no more)
1 variables deleted.
X pairs with correlations > 0.995 :
FTSE_Open_Price_sqar , FTSE_Open_Price_sqrt
FTSE_Open_Price_sqrt , FTSE_Open_Price
FTSE_High_Price_sqar , FTSE_High_Price
FTSE_High_Price , FTSE_High_Price_sqrt
FTSE_Low_Price_sqrt , FTSE_Low_Price_sqar
FTSE_Low_Price_sqar , FTSE_Low_Price
6 variables considered for deletion:
FTSE_Open_Price_sqar
FTSE_Open_Price_sqrt
FTSE_High_Price_sqar
FTSE_High_Price_sqrt
FTSE_Low_Price_sqrt
FTSE_Low_Price_sqar
X pairs with correlations > 0.995 :
(no more)
6 transformed variables deleted.
Fit using reproducible random 80% (x_train & y_train) of data rows:
OLS fit including only 17 untransformed Xs:
Results: Ordinary least squares
============================================================================================
Model: OLS Adj. R-squared: 0.993
Dependent Variable: FTSE_Close_Price AIC: 3643.1592
Date: 2021-04-22 13:47 BIC: 3712.6534
No. Observations: 351 Log-Likelihood: -1803.6
Df Model: 17 F-statistic: 2885.
Df Residuals: 333 Prob (F-statistic): 0.00
R-squared: 0.993 Scale: 1792.9
--------------------------------------------------------------------------------------------
Coef. Std.Err. t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept 231.5217 111.9089 2.0688 0.0393 11.3842 451.6592
FTSE_Low_Price 0.7842 0.0457 17.1672 0.0000 0.6943 0.8740
FTSE_High_Price 0.6129 0.0549 11.1584 0.0000 0.5048 0.7209
FTSE_Open_Price -0.3757 0.0363 -10.3364 0.0000 -0.4472 -0.3042
SPY_Open_Price -0.6321 0.3893 -1.6237 0.1054 -1.3978 0.1337
cumCasesByPublishDate -0.0001 0.0000 -1.2514 0.2117 -0.0002 0.0000
FTSE_Volume -0.0000 0.0000 -2.5064 0.0127 -0.0000 -0.0000
US_Covid_Cases -0.0000 0.0000 -1.1749 0.2409 -0.0000 0.0000
cumVirusTests 0.0000 0.0000 2.0727 0.0390 0.0000 0.0000
US_Covid_Deaths -0.0003 0.0005 -0.5383 0.5907 -0.0013 0.0008
cumAdmissions 0.0018 0.0012 1.4977 0.1351 -0.0006 0.0042
cumPeopleVaccinatedCompleteByPublishDate -0.0000 0.0000 -2.0560 0.0406 -0.0000 -0.0000
cumDailyNsoDeathsByDeathDate -0.0026 0.0027 -0.9650 0.3352 -0.0078 0.0027
Unnamed__0 0.1010 0.2194 0.4602 0.6457 -0.3307 0.5326
Sun_Hours 0.4980 0.3462 1.4383 0.1513 -0.1831 1.1791
Max_Temperature_DegC -27.0565 12.0258 -2.2499 0.0251 -50.7126 -3.4004
Rainfall_mm -0.5798 0.5001 -1.1594 0.2471 -1.5635 0.4039
Min_Temperature_DegC 30.7883 12.9053 2.3857 0.0176 5.4022 56.1745
--------------------------------------------------------------------------------------------
Omnibus: 41.833 Durbin-Watson: 2.000
Prob(Omnibus): 0.000 Jarque-Bera (JB): 248.624
Skew: 0.195 Prob(JB): 0.000
Kurtosis: 7.105 Condition No.: 49810797143
============================================================================================
* The condition number is large (5e+10). This might indicate strong
multicollinearity or other numerical problems.
Descending order of 17 X's significance, assuming Normal error distribution:
FTSE_Low_Price
FTSE_High_Price
FTSE_Open_Price
FTSE_Volume
Min_Temperature_DegC
Max_Temperature_DegC
cumVirusTests
cumPeopleVaccinatedCompleteByPublishDate
SPY_Open_Price
cumAdmissions
Sun_Hours
cumCasesByPublishDate
US_Covid_Cases
Rainfall_mm
cumDailyNsoDeathsByDeathDate
US_Covid_Deaths
Unnamed__0
Rank deficiency = 0: Df Model (17) is same as number of Xs (17).
NegativeBinomial fit including transformed Xs:
Adjusted R² = 0.10556219477187478 , max(X p-value) = 0.7917911587564879 , rank deficiency = 35 , for 45 Xs.
Variable to drop: FTSE_Volume_sqar
Adjusted R² = -inf , max(X p-value) = 0.9435650977947586 , rank deficiency = 23 , for 44 Xs.
Variable to drop: cumCasesByPublishDate
Adjusted R² = -0.05355034368110845 , max(X p-value) = 0.9975829812799926 , rank deficiency = 25 , for 43 Xs.
Variable to drop: Min_Temperature_DegC
Adjusted R² = -0.0537055641504498 , max(X p-value) = 0.9960242941934189 , rank deficiency = 24 , for 42 Xs.
Variable to drop: cumAdmissions_sqar
Adjusted R² = -inf , max(X p-value) = 0.9608550656310355 , rank deficiency = 20 , for 41 Xs.
Variable to drop: cumCasesByPublishDate_sqar
Adjusted R² = -inf , max(X p-value) = 0.9973669271050765 , rank deficiency = 20 , for 40 Xs.
Variable to drop: Max_Temperature_DegC_sqar
Adjusted R² = -inf , max(X p-value) = 0.9558056240782696 , rank deficiency = 19 , for 39 Xs.
Variable to drop: Sun_Hours_sqar
Adjusted R² = -inf , max(X p-value) = 0.5233467433229444 , rank deficiency = 19 , for 38 Xs.
Variable to drop: Min_Temperature_DegC_sqar
Adjusted R² = -inf , max(X p-value) = 0.3699536839240257 , rank deficiency = 18 , for 37 Xs.
Variable to drop: cumVirusTests
Adjusted R² = -inf , max(X p-value) = 0.06674223971669034 , rank deficiency = 17 , for 36 Xs.
Variable to drop: FTSE_Low_Price
Adjusted R² = -inf , max(X p-value) = 0.0006908523565386033 , rank deficiency = 17 , for 35 Xs.
Variable to drop: FTSE_Open_Price
Adjusted R² = -inf , max(X p-value) = 8.66411976068563e-05 , rank deficiency = 17 , for 34 Xs.
Variable to drop: cumVirusTests_sqar
Adjusted R² = -inf , max(X p-value) = 0.9898939915913645 , rank deficiency = 12 , for 33 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqrt
Adjusted R² = 0.02156212770937549 , max(X p-value) = 0.9583401041951407 , rank deficiency = 16 , for 32 Xs.
Variable to drop: Rainfall_mm_sqar
Adjusted R² = 0.6587569534202322 , max(X p-value) = 0.9882514310340098 , rank deficiency = 16 , for 31 Xs.
Variable to drop: US_Covid_Deaths_sqrt
Adjusted R² = -inf , max(X p-value) = 0.9734845713522413 , rank deficiency = 10 , for 30 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqar
Adjusted R² = 0.6251577529741614 , max(X p-value) = 0.995982834402968 , rank deficiency = 15 , for 29 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqar
Adjusted R² = 0.1994060136215957 , max(X p-value) = 0.994932947842515 , rank deficiency = 15 , for 28 Xs.
Variable to drop: FTSE_Volume
Adjusted R² = -0.03519797217413956 , max(X p-value) = 0.9754743022239978 , rank deficiency = 15 , for 27 Xs.
Variable to drop: Rainfall_mm
Adjusted R² = -inf , max(X p-value) = 0.8147324259774156 , rank deficiency = 9 , for 26 Xs.
Variable to drop: Sun_Hours
Adjusted R² = 0.8189359623660823 , max(X p-value) = 0.9973250458281455 , rank deficiency = 13 , for 25 Xs.
Variable to drop: FTSE_Volume_sqrt
Adjusted R² = 0.8193551235847633 , max(X p-value) = 0.9735324232578937 , rank deficiency = 13 , for 24 Xs.
Variable to drop: Sun_Hours_sqrt
Adjusted R² = 0.8202830438131821 , max(X p-value) = 0.88295630773399 , rank deficiency = 12 , for 23 Xs.
Variable to drop: US_Covid_Cases
Adjusted R² = 0.8100094344830042 , max(X p-value) = 0.9356589759737556 , rank deficiency = 12 , for 22 Xs.
Variable to drop: Unnamed__0_sqrt
Adjusted R² = 0.8100918213094912 , max(X p-value) = 0.8962781645093912 , rank deficiency = 11 , for 21 Xs.
Variable to drop: US_Covid_Cases_sqar
Adjusted R² = 0.9867513439711774 , max(X p-value) = 0.9990331658921388 , rank deficiency = 2 , for 20 Xs.
Variable to drop: Rainfall_mm_sqrt
Adjusted R² = 0.986752630785516 , max(X p-value) = 0.9964065939588165 , rank deficiency = 1 , for 19 Xs.
Variable to drop: cumCasesByPublishDate_sqrt
Adjusted R² = 0.9867892550150539 , max(X p-value) = 0.995439217066314 , rank deficiency = 1 , for 18 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate
Adjusted R² = 0.9868258804627718 , max(X p-value) = 0.9937026596441347 , rank deficiency = 1 , for 17 Xs.
Variable to drop: Unnamed__0_sqar
Adjusted R² = 0.9868273795097414 , max(X p-value) = 0.9924625203102213 , rank deficiency = 1 , for 16 Xs.
Variable to drop: Max_Temperature_DegC_sqrt
Adjusted R² = 0.9868162410403454 , max(X p-value) = 0.9907495752625315 , rank deficiency = 1 , for 15 Xs.
Variable to drop: US_Covid_Cases_sqrt
Adjusted R² = 0.9868003538219083 , max(X p-value) = 0.9851379820962101 , rank deficiency = 1 , for 14 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate
Adjusted R² = 0.9867193043091383 , max(X p-value) = 0.9835476321433216 , rank deficiency = 1 , for 13 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqrt
Adjusted R² = 0.9865918193036649 , max(X p-value) = 0.9891356808637602 , rank deficiency = 1 , for 12 Xs.
Variable to drop: Unnamed__0
Adjusted R² = 0.9865283795571441 , max(X p-value) = 0.9951479446884365 , rank deficiency = 1 , for 11 Xs.
Variable to drop: cumVirusTests_sqrt
Adjusted R² = 0.986549952375931 , max(X p-value) = 0.9806679900434531 , rank deficiency = 1 , for 10 Xs.
Variable to drop: Max_Temperature_DegC
Adjusted R² = 0.9863421703339775 , max(X p-value) = 0.992874255712171 , rank deficiency = 1 , for 9 Xs.
Variable to drop: Min_Temperature_DegC_sqrt
Adjusted R² = 0.9863263893798977 , max(X p-value) = 0.9803006927309468 , rank deficiency = 1 , for 8 Xs.
Variable to drop: SPY_Open_Price_sqar
Adjusted R² = 0.9862278187633908 , max(X p-value) = 0.9956424412833313 , rank deficiency = 1 , for 7 Xs.
Variable to drop: SPY_Open_Price_sqrt
Adjusted R² = 0.9862389739676911 , max(X p-value) = 0.9703990949919793 , rank deficiency = 0 , for 6 Xs.
Variable to drop: cumAdmissions
Adjusted R² = 0.9857067242075774 , max(X p-value) = 0.9655247900167468 , rank deficiency = 0 , for 5 Xs.
Variable to drop: US_Covid_Deaths_sqar
Adjusted R² = 0.9848334975065126 , max(X p-value) = 0.9494025035116354 , rank deficiency = 0 , for 4 Xs.
Variable to drop: SPY_Open_Price
Adjusted R² = 0.9831568507611748 , max(X p-value) = 0.9224867106787983 , rank deficiency = 0 , for 3 Xs.
Variable to drop: US_Covid_Deaths
Adjusted R² = 0.9796635154891549 , max(X p-value) = 0.8923443478776543 , rank deficiency = 0 , for 2 Xs.
Variable to drop: cumAdmissions_sqrt
Adjusted R² = 0.9719236846624061 , max(X p-value) = 0.14521457500435586 , rank deficiency = 0 , for 1 Xs.
Variable left: FTSE_High_Price
Restarting from best model (with 6 Xs & Adjusted R² = 0.9862389739676911) found so far...
Adding 6 2-way interactions among 4 untransformed variables in best model found so far:
FTSE_High_Price_x_SPY_Open_Price
FTSE_High_Price_x_US_Covid_Deaths
FTSE_High_Price_x_cumAdmissions
SPY_Open_Price_x_US_Covid_Deaths
SPY_Open_Price_x_cumAdmissions
US_Covid_Deaths_x_cumAdmissions
X pairs with correlations > 0.995 :
FTSE_High_Price_x_US_Covid_Deaths , SPY_Open_Price_x_US_Covid_Deaths
1 variables considered for deletion:
FTSE_High_Price_x_US_Covid_Deaths
X pairs with correlations > 0.995 :
(no more)
1 interaction variables deleted.
Adjusted R² = 0.9865535955200001 , max(X p-value) = 0.995395084439544 , rank deficiency = 1 , for 11 Xs.
Variable to drop: US_Covid_Deaths_x_cumAdmissions
Adjusted R² = 0.9865758847151063 , max(X p-value) = 0.9966536496399322 , rank deficiency = 1 , for 10 Xs.
Variable to drop: US_Covid_Deaths
Adjusted R² = 0.9866228329473213 , max(X p-value) = 0.9987793902726597 , rank deficiency = 1 , for 9 Xs.
Variable to drop: cumAdmissions
Adjusted R² = 0.986657605429283 , max(X p-value) = 0.9845384272832061 , rank deficiency = 1 , for 8 Xs.
Variable to drop: SPY_Open_Price_x_cumAdmissions
Adjusted R² = 0.9865598994247278 , max(X p-value) = 0.9788807772504783 , rank deficiency = 1 , for 7 Xs.
Variable to drop: SPY_Open_Price
Adjusted R² = 0.986435343884349 , max(X p-value) = 0.9790496249093501 , rank deficiency = 0 , for 6 Xs.
Variable to drop: FTSE_High_Price_x_cumAdmissions
Adjusted R² = 0.986169748664666 , max(X p-value) = 0.9534197238621716 , rank deficiency = 0 , for 5 Xs.
Variable to drop: US_Covid_Deaths_sqar
Adjusted R² = 0.9846997017074276 , max(X p-value) = 0.9561169223923078 , rank deficiency = 0 , for 4 Xs.
Variable to drop: FTSE_High_Price_x_SPY_Open_Price
Adjusted R² = 0.9834268822292653 , max(X p-value) = 0.9196092489576535 , rank deficiency = 0 , for 3 Xs.
Variable to drop: SPY_Open_Price_x_US_Covid_Deaths
Adjusted R² = 0.9796635154891549 , max(X p-value) = 0.8923443478776543 , rank deficiency = 0 , for 2 Xs.
Variable to drop: cumAdmissions_sqrt
Adjusted R² = 0.9719236846624061 , max(X p-value) = 0.14521457500435586 , rank deficiency = 0 , for 1 Xs.
Variable left: FTSE_High_Price
Best model has 6 Xs (Adjusted R² = 0.986435343884349 , rank deficiency = 0):
Results: Generalized linear model
================================================================================
Model: GLM AIC: 6855.5457
Link Function: log BIC: -2016.0774
Dependent Variable: FTSE_Close_Price Log-Likelihood: -3420.8
Date: 2021-04-22 13:47 LL-Null: -3421.9
No. Observations: 351 Deviance: 0.033042
Df Model: 6 Pearson chi2: 0.0328
Df Residuals: 344 Scale: 1.0000
Method: IRLS
--------------------------------------------------------------------------------
Coef. Std.Err. z P>|z| [0.025 0.975]
--------------------------------------------------------------------------------
const 7.7323 3.1737 2.4364 0.0148 1.5120 13.9526
FTSE_High_Price 0.0001 0.0009 0.1387 0.8897 -0.0017 0.0020
FTSE_High_Price_x_SPY_Open_Price 0.0000 0.0000 0.0457 0.9635 -0.0000 0.0000
US_Covid_Deaths_sqar 0.0000 0.0000 0.0627 0.9500 -0.0000 0.0000
SPY_Open_Price_x_US_Covid_Deaths -0.0000 0.0000 -0.0525 0.9581 -0.0000 0.0000
FTSE_High_Price_x_cumAdmissions -0.0000 0.0000 -0.0263 0.9790 -0.0000 0.0000
cumAdmissions_sqrt 0.0002 0.0024 0.0913 0.9273 -0.0045 0.0050
================================================================================
Descending order of 6 X's significance, assuming NegativeBinomial error distribution:
Coefficient z-stat
const 7.732321e+00 2.436379
FTSE_High_Price 1.314279e-04 0.138651
cumAdmissions_sqrt 2.220233e-04 0.091277
US_Covid_Deaths_sqar 3.826540e-13 0.062679
SPY_Open_Price_x_US_Covid_Deaths -8.974774e-10 -0.052502
FTSE_High_Price_x_SPY_Open_Price 7.580649e-08 0.045711
FTSE_High_Price_x_cumAdmissions -2.577576e-11 -0.026260
Rank deficiency = 0: Df Model (6) is same as number of Xs (6).
Partial Leverage (or Partial Regression, or Added-Variable) diagnostic plots for fit:
NegativeBinomial fit using reproducible random 80% (x_train & y_train) of data rows:
Mean Absolute Residual = 40.568278576941594
Root Mean Squared Residual = 57.987066282689405
R² = 0.9866678808463315
NegativeBinomial prediction using remaining 20% (x_test & y_test) of data rows:
Mean Absolute Error = 33.88302071770381
Root Mean Squared Error = 47.724854685168026
R² = 0.9905446302957932
Plots of train-set fit & test-set predict:
========================================================== Next GLM Model ==============================================================
counter = 8
GLM Model Fitted = Tweedie
1 entered.
Assuming Tweedie error distribution.
X pairs with correlations > 0.995 :
SPY_Close_Price , SPY_Open_Price
1 variables considered for deletion:
SPY_Close_Price
X pairs with correlations > 0.995 :
(no more)
1 variables deleted.
X pairs with correlations > 0.995 :
FTSE_Open_Price_sqar , FTSE_Open_Price_sqrt
FTSE_Open_Price_sqrt , FTSE_Open_Price
FTSE_High_Price_sqar , FTSE_High_Price
FTSE_High_Price , FTSE_High_Price_sqrt
FTSE_Low_Price_sqrt , FTSE_Low_Price_sqar
FTSE_Low_Price_sqar , FTSE_Low_Price
6 variables considered for deletion:
FTSE_Open_Price_sqar
FTSE_Open_Price_sqrt
FTSE_High_Price_sqar
FTSE_High_Price_sqrt
FTSE_Low_Price_sqrt
FTSE_Low_Price_sqar
X pairs with correlations > 0.995 :
(no more)
6 transformed variables deleted.
Fit using reproducible random 80% (x_train & y_train) of data rows:
OLS fit including only 17 untransformed Xs:
Results: Ordinary least squares
============================================================================================
Model: OLS Adj. R-squared: 0.993
Dependent Variable: FTSE_Close_Price AIC: 3643.1592
Date: 2021-04-22 13:47 BIC: 3712.6534
No. Observations: 351 Log-Likelihood: -1803.6
Df Model: 17 F-statistic: 2885.
Df Residuals: 333 Prob (F-statistic): 0.00
R-squared: 0.993 Scale: 1792.9
--------------------------------------------------------------------------------------------
Coef. Std.Err. t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept 231.5217 111.9089 2.0688 0.0393 11.3842 451.6592
FTSE_Low_Price 0.7842 0.0457 17.1672 0.0000 0.6943 0.8740
FTSE_High_Price 0.6129 0.0549 11.1584 0.0000 0.5048 0.7209
FTSE_Open_Price -0.3757 0.0363 -10.3364 0.0000 -0.4472 -0.3042
SPY_Open_Price -0.6321 0.3893 -1.6237 0.1054 -1.3978 0.1337
cumCasesByPublishDate -0.0001 0.0000 -1.2514 0.2117 -0.0002 0.0000
FTSE_Volume -0.0000 0.0000 -2.5064 0.0127 -0.0000 -0.0000
US_Covid_Cases -0.0000 0.0000 -1.1749 0.2409 -0.0000 0.0000
cumVirusTests 0.0000 0.0000 2.0727 0.0390 0.0000 0.0000
US_Covid_Deaths -0.0003 0.0005 -0.5383 0.5907 -0.0013 0.0008
cumAdmissions 0.0018 0.0012 1.4977 0.1351 -0.0006 0.0042
cumPeopleVaccinatedCompleteByPublishDate -0.0000 0.0000 -2.0560 0.0406 -0.0000 -0.0000
cumDailyNsoDeathsByDeathDate -0.0026 0.0027 -0.9650 0.3352 -0.0078 0.0027
Unnamed__0 0.1010 0.2194 0.4602 0.6457 -0.3307 0.5326
Sun_Hours 0.4980 0.3462 1.4383 0.1513 -0.1831 1.1791
Max_Temperature_DegC -27.0565 12.0258 -2.2499 0.0251 -50.7126 -3.4004
Rainfall_mm -0.5798 0.5001 -1.1594 0.2471 -1.5635 0.4039
Min_Temperature_DegC 30.7883 12.9053 2.3857 0.0176 5.4022 56.1745
--------------------------------------------------------------------------------------------
Omnibus: 41.833 Durbin-Watson: 2.000
Prob(Omnibus): 0.000 Jarque-Bera (JB): 248.624
Skew: 0.195 Prob(JB): 0.000
Kurtosis: 7.105 Condition No.: 49810797143
============================================================================================
* The condition number is large (5e+10). This might indicate strong
multicollinearity or other numerical problems.
Descending order of 17 X's significance, assuming Normal error distribution:
FTSE_Low_Price
FTSE_High_Price
FTSE_Open_Price
FTSE_Volume
Min_Temperature_DegC
Max_Temperature_DegC
cumVirusTests
cumPeopleVaccinatedCompleteByPublishDate
SPY_Open_Price
cumAdmissions
Sun_Hours
cumCasesByPublishDate
US_Covid_Cases
Rainfall_mm
cumDailyNsoDeathsByDeathDate
US_Covid_Deaths
Unnamed__0
Rank deficiency = 0: Df Model (17) is same as number of Xs (17).
Tweedie fit including transformed Xs:
Adjusted R² = 0.10761270518087696 , max(X p-value) = 0.004398125188411098 , rank deficiency = 35 , for 45 Xs.
Variable to drop: FTSE_Volume_sqar
Adjusted R² = -0.05502827805304489 , max(X p-value) = 0.999482024827365 , rank deficiency = 25 , for 44 Xs.
Variable to drop: US_Covid_Cases_sqar
Adjusted R² = -inf , max(X p-value) = 0.7756314672635296 , rank deficiency = 22 , for 43 Xs.
Variable to drop: cumCasesByPublishDate_sqar
Adjusted R² = -inf , max(X p-value) = 0.7021852164619822 , rank deficiency = 22 , for 42 Xs.
Variable to drop: US_Covid_Cases_sqrt
Adjusted R² = 0.38164885270568827 , max(X p-value) = 0.9816934973000383 , rank deficiency = 24 , for 41 Xs.
Variable to drop: Min_Temperature_DegC
Adjusted R² = 0.3798429854997377 , max(X p-value) = 0.9488073906892044 , rank deficiency = 23 , for 40 Xs.
Variable to drop: cumVirusTests_sqrt
Adjusted R² = 0.37694924465064117 , max(X p-value) = 0.8453865665950159 , rank deficiency = 22 , for 39 Xs.
Variable to drop: Unnamed__0_sqar
Adjusted R² = -inf , max(X p-value) = 0.6112581905609482 , rank deficiency = 19 , for 38 Xs.
Variable to drop: Max_Temperature_DegC_sqar
Adjusted R² = -inf , max(X p-value) = 0.7126190445201326 , rank deficiency = 18 , for 37 Xs.
Variable to drop: Min_Temperature_DegC_sqar
Adjusted R² = -inf , max(X p-value) = 0.950671646499198 , rank deficiency = 17 , for 36 Xs.
Variable to drop: Min_Temperature_DegC_sqrt
Adjusted R² = -0.047166362946634344 , max(X p-value) = 0.9999999783391182 , rank deficiency = 19 , for 35 Xs.
Variable to drop: US_Covid_Cases
Adjusted R² = -0.038450070814974424 , max(X p-value) = 0.9999999945847572 , rank deficiency = 19 , for 34 Xs.
Variable to drop: SPY_Open_Price
Adjusted R² = -inf , max(X p-value) = 0.08066740898607977 , rank deficiency = 15 , for 33 Xs.
Variable to drop: FTSE_Low_Price
Adjusted R² = 0.28456593287388754 , max(X p-value) = 0.309909743550983 , rank deficiency = 17 , for 32 Xs.
Variable to drop: FTSE_Open_Price
Adjusted R² = -inf , max(X p-value) = 0.007762254471636899 , rank deficiency = 15 , for 31 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate
Adjusted R² = -0.039012396191582255 , max(X p-value) = 0.9999999490525224 , rank deficiency = 16 , for 30 Xs.
Variable to drop: cumVirusTests
Adjusted R² = -inf , max(X p-value) = 0.5370773892483486 , rank deficiency = 15 , for 29 Xs.
Variable to drop: US_Covid_Deaths_sqrt
Adjusted R² = 0.41140279482399156 , max(X p-value) = 0.32160307143818356 , rank deficiency = 15 , for 28 Xs.
Variable to drop: Unnamed__0
Adjusted R² = 0.41132989268499576 , max(X p-value) = 0.027115601400463964 , rank deficiency = 14 , for 27 Xs.
Variable to drop: cumVirusTests_sqar
Adjusted R² = 0.9493617215144389 , max(X p-value) = 0.8598702139145862 , rank deficiency = 5 , for 26 Xs.
Variable to drop: FTSE_Volume_sqrt
Adjusted R² = 0.9495172965341214 , max(X p-value) = 0.03524702010027977 , rank deficiency = 5 , for 25 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqar
Adjusted R² = 0.9550928079103578 , max(X p-value) = 0.8702583057457753 , rank deficiency = 5 , for 24 Xs.
Variable to drop: cumCasesByPublishDate_sqrt
Adjusted R² = 0.9553766278110001 , max(X p-value) = 0.11729700459048634 , rank deficiency = 5 , for 23 Xs.
Variable to drop: cumAdmissions
Adjusted R² = 0.955443911976678 , max(X p-value) = 1.7003891957415975e-07 , rank deficiency = 5 , for 22 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqar
Adjusted R² = 0.9874287103619765 , max(X p-value) = 0.9609615783054877 , rank deficiency = 2 , for 21 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate
Adjusted R² = 0.9874667097960268 , max(X p-value) = 0.8535196896733284 , rank deficiency = 2 , for 20 Xs.
Variable to drop: SPY_Open_Price_sqar
Adjusted R² = 0.9875000670289629 , max(X p-value) = 0.5584428421346406 , rank deficiency = 2 , for 19 Xs.
Variable to drop: cumAdmissions_sqrt
Adjusted R² = 0.9875322275646254 , max(X p-value) = 0.5056610951671816 , rank deficiency = 2 , for 18 Xs.
Variable to drop: Max_Temperature_DegC_sqrt
Adjusted R² = 0.9875086166047718 , max(X p-value) = 0.21466738839120048 , rank deficiency = 1 , for 17 Xs.
Variable to drop: Max_Temperature_DegC
Adjusted R² = 0.98749402903174 , max(X p-value) = 0.32480285657356434 , rank deficiency = 1 , for 16 Xs.
Variable to drop: Sun_Hours_sqrt
Adjusted R² = 0.9874587851008431 , max(X p-value) = 0.2221681206857521 , rank deficiency = 0 , for 15 Xs.
Variable to drop: Rainfall_mm_sqar
Adjusted R² = 0.9874318522946246 , max(X p-value) = 0.21755945721085446 , rank deficiency = 0 , for 14 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqrt
Adjusted R² = 0.987417227220917 , max(X p-value) = 0.08706798149894139 , rank deficiency = 0 , for 13 Xs.
Variable to drop: Unnamed__0_sqrt
Adjusted R² = 0.9873360650521253 , max(X p-value) = 0.026850100317834316 , rank deficiency = 0 , for 12 Xs.
Variable to drop: Rainfall_mm
Adjusted R² = 0.9871859165905457 , max(X p-value) = 0.1861442216693382 , rank deficiency = 0 , for 11 Xs.
Variable to drop: Rainfall_mm_sqrt
Adjusted R² = 0.9871640189187697 , max(X p-value) = 0.034527220497368345 , rank deficiency = 0 , for 10 Xs.
Variable to drop: Sun_Hours
Adjusted R² = 0.9870118889915588 , max(X p-value) = 0.6063396282597284 , rank deficiency = 0 , for 9 Xs.
Variable to drop: Sun_Hours_sqar
Adjusted R² = 0.9870400954137113 , max(X p-value) = 0.00045093913049284426 , rank deficiency = 0 , for 8 Xs.
Variable to drop: cumCasesByPublishDate
Adjusted R² = 0.986625340381458 , max(X p-value) = 0.038028504334903596 , rank deficiency = 0 , for 7 Xs.
Variable to drop: cumAdmissions_sqar
Adjusted R² = 0.9864958664486677 , max(X p-value) = 0.0024745343929827375 , rank deficiency = 0 , for 6 Xs.
Variable to drop: US_Covid_Deaths_sqar
Adjusted R² = 0.9861500818529498 , max(X p-value) = 0.012801727892751056 , rank deficiency = 0 , for 5 Xs.
Variable to drop: SPY_Open_Price_sqrt
Adjusted R² = 0.9859026112556276 , max(X p-value) = 9.72047491321233e-11 , rank deficiency = 0 , for 4 Xs.
Variable to drop: US_Covid_Deaths
Adjusted R² = 0.9842687836192412 , max(X p-value) = 6.022536470953924e-18 , rank deficiency = 0 , for 3 Xs.
Variable to drop: FTSE_Volume
Adjusted R² = 0.9810501495095538 , max(X p-value) = 4.1255003745260946e-38 , rank deficiency = 0 , for 2 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqrt
Adjusted R² = 0.9719822109943467 , max(X p-value) = 0.0 , rank deficiency = 0 , for 1 Xs.
Variable left: FTSE_High_Price
Restarting from best model (with 12 Xs & Adjusted R² = 0.9873360650521253) found so far...
Adding 15 2-way interactions among 6 untransformed variables in best model found so far:
FTSE_High_Price_x_cumCasesByPublishDate
FTSE_High_Price_x_FTSE_Volume
FTSE_High_Price_x_US_Covid_Deaths
FTSE_High_Price_x_Sun_Hours
FTSE_High_Price_x_Rainfall_mm
cumCasesByPublishDate_x_FTSE_Volume
cumCasesByPublishDate_x_US_Covid_Deaths
cumCasesByPublishDate_x_Sun_Hours
cumCasesByPublishDate_x_Rainfall_mm
FTSE_Volume_x_US_Covid_Deaths
FTSE_Volume_x_Sun_Hours
FTSE_Volume_x_Rainfall_mm
US_Covid_Deaths_x_Sun_Hours
US_Covid_Deaths_x_Rainfall_mm
Sun_Hours_x_Rainfall_mm
X pairs with correlations > 0.995 :
(no more)
Adjusted R² = 0.6117456902246827 , max(X p-value) = 0.7674912638958635 , rank deficiency = 7 , for 27 Xs.
Variable to drop: FTSE_Volume_x_Rainfall_mm
Adjusted R² = 0.5809361919424552 , max(X p-value) = 0.5146997646484246 , rank deficiency = 7 , for 26 Xs.
Variable to drop: US_Covid_Deaths_x_Sun_Hours
Adjusted R² = 0.5822044847098968 , max(X p-value) = 2.6988328912301336e-05 , rank deficiency = 7 , for 25 Xs.
Variable to drop: cumAdmissions_sqar
Adjusted R² = 0.5827722872532906 , max(X p-value) = 2.9914719668961206e-25 , rank deficiency = 7 , for 24 Xs.
Variable to drop: US_Covid_Deaths_x_Rainfall_mm
Adjusted R² = 0.5757149205397445 , max(X p-value) = 3.523619623454815e-177 , rank deficiency = 7 , for 23 Xs.
Variable to drop: FTSE_High_Price_x_cumCasesByPublishDate
Adjusted R² = 0.5476083109125858 , max(X p-value) = 1.1235653142284543e-91 , rank deficiency = 7 , for 22 Xs.
Variable to drop: cumCasesByPublishDate_x_US_Covid_Deaths
Adjusted R² = 0.53805413650821 , max(X p-value) = 4.094065597397201e-107 , rank deficiency = 7 , for 21 Xs.
Variable to drop: US_Covid_Deaths_sqar
Adjusted R² = 0.521491061477608 , max(X p-value) = 3.9364336347402475e-168 , rank deficiency = 7 , for 20 Xs.
Variable to drop: FTSE_High_Price_x_US_Covid_Deaths
Adjusted R² = 0.4958019832611603 , max(X p-value) = 3.5791136328110536e-112 , rank deficiency = 7 , for 19 Xs.
Variable to drop: US_Covid_Deaths
Adjusted R² = 0.481815700616786 , max(X p-value) = 4.715661961050626e-114 , rank deficiency = 7 , for 18 Xs.
Variable to drop: FTSE_Volume_x_US_Covid_Deaths
Adjusted R² = 0.4655017360860819 , max(X p-value) = 1.1659290394989419e-18 , rank deficiency = 7 , for 17 Xs.
Variable to drop: cumCasesByPublishDate_x_FTSE_Volume
Adjusted R² = 0.9854044278384364 , max(X p-value) = 0.7203025649669162 , rank deficiency = 4 , for 16 Xs.
Variable to drop: SPY_Open_Price_sqrt
Adjusted R² = 0.985406705371845 , max(X p-value) = 0.20705591976156368 , rank deficiency = 3 , for 15 Xs.
Variable to drop: FTSE_High_Price_x_FTSE_Volume
Adjusted R² = 0.9861494132854418 , max(X p-value) = 0.6861739172978435 , rank deficiency = 1 , for 14 Xs.
Variable to drop: Sun_Hours_sqar
Adjusted R² = 0.9861842032639709 , max(X p-value) = 0.048859333617379194 , rank deficiency = 1 , for 13 Xs.
Variable to drop: cumCasesByPublishDate_x_Sun_Hours
Adjusted R² = 0.9860789550795643 , max(X p-value) = 0.47088136186015117 , rank deficiency = 1 , for 12 Xs.
Variable to drop: FTSE_High_Price
Adjusted R² = 0.9861003975732404 , max(X p-value) = 0.30990446392219384 , rank deficiency = 1 , for 11 Xs.
Variable to drop: cumCasesByPublishDate
Adjusted R² = 0.9860948453619355 , max(X p-value) = 0.25500010420868546 , rank deficiency = 1 , for 10 Xs.
Variable to drop: FTSE_Volume_x_Sun_Hours
Adjusted R² = 0.9860383933824362 , max(X p-value) = 0.08785013513294769 , rank deficiency = 0 , for 9 Xs.
Variable to drop: Rainfall_mm_sqrt
Adjusted R² = 0.9859607275500956 , max(X p-value) = 0.2743415076761513 , rank deficiency = 0 , for 8 Xs.
Variable to drop: Sun_Hours_x_Rainfall_mm
Adjusted R² = 0.985943980564216 , max(X p-value) = 2.4495102383283892e-08 , rank deficiency = 0 , for 7 Xs.
Variable to drop: FTSE_Volume
Adjusted R² = 0.9847300372460975 , max(X p-value) = 3.718601433367762e-20 , rank deficiency = 0 , for 6 Xs.
Variable to drop: cumCasesByPublishDate_x_Rainfall_mm
Adjusted R² = 0.9810458707242862 , max(X p-value) = 5.659771232307772e-19 , rank deficiency = 0 , for 5 Xs.
Variable to drop: Rainfall_mm
Adjusted R² = 0.9764859617276777 , max(X p-value) = 0.006238427982343841 , rank deficiency = 0 , for 4 Xs.
Variable to drop: FTSE_High_Price_x_Rainfall_mm
Adjusted R² = 0.9759896303993655 , max(X p-value) = 2.509657418085927e-26 , rank deficiency = 0 , for 3 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqrt
Adjusted R² = 0.9683501947021502 , max(X p-value) = 0.0 , rank deficiency = 0 , for 2 Xs.
Variable to drop: Sun_Hours
Adjusted R² = 0.1901564428704755 , max(X p-value) = 4.3624577412865863e-23 , rank deficiency = 0 , for 1 Xs.
Variable left: FTSE_High_Price_x_Sun_Hours
Best model has 12 Xs (Adjusted R² = 0.9873360650521253 , rank deficiency = 0):
Results: Generalized linear model
==================================================================================
Model: GLM AIC: nan
Link Function: log BIC: -1800.4950
Dependent Variable: FTSE_Close_Price Log-Likelihood: nan
Date: 2021-04-22 13:47 LL-Null: nan
No. Observations: 351 Deviance: 180.45
Df Model: 12 Pearson chi2: 180.
Df Residuals: 338 Scale: 0.53197
Method: IRLS
----------------------------------------------------------------------------------
Coef. Std.Err. z P>|z| [0.025 0.975]
----------------------------------------------------------------------------------
const 7.6075 0.0418 181.9366 0.0000 7.5256 7.6895
FTSE_High_Price 0.0002 0.0000 47.8864 0.0000 0.0001 0.0002
SPY_Open_Price_sqrt 0.0123 0.0028 4.3369 0.0000 0.0067 0.0179
cumCasesByPublishDate 0.0000 0.0000 3.5978 0.0003 0.0000 0.0000
FTSE_Volume -0.0000 0.0000 -4.3536 0.0000 -0.0000 -0.0000
cumAdmissions_sqar -0.0000 0.0000 -3.8977 0.0001 -0.0000 -0.0000
US_Covid_Deaths_sqar 0.0000 0.0000 4.8586 0.0000 0.0000 0.0000
US_Covid_Deaths -0.0000 0.0000 -6.0988 0.0000 -0.0000 -0.0000
Sun_Hours 0.0004 0.0001 3.0680 0.0022 0.0002 0.0007
Rainfall_mm 0.0009 0.0004 2.2137 0.0269 0.0001 0.0017
Rainfall_mm_sqrt -0.0172 0.0069 -2.4802 0.0131 -0.0307 -0.0036
Sun_Hours_sqar -0.0000 0.0000 -3.1866 0.0014 -0.0000 -0.0000
cumDailyNsoDeathsByDeathDate_sqrt 0.0002 0.0000 10.0085 0.0000 0.0002 0.0003
==================================================================================
Descending order of 12 X's significance, assuming Tweedie error distribution:
Coefficient z-stat
const 7.607540e+00 181.936645
FTSE_High_Price 1.523792e-04 47.886397
cumDailyNsoDeathsByDeathDate_sqrt 2.280805e-04 10.008548
US_Covid_Deaths -3.371130e-07 -6.098835
US_Covid_Deaths_sqar 4.959323e-13 4.858586
FTSE_Volume -7.723129e-12 -4.353649
SPY_Open_Price_sqrt 1.229833e-02 4.336866
cumAdmissions_sqar -5.194551e-13 -3.897708
cumCasesByPublishDate 1.196678e-08 3.597816
Sun_Hours_sqar -1.767046e-06 -3.186599
Sun_Hours 4.451579e-04 3.068001
Rainfall_mm_sqrt -1.717203e-02 -2.480151
Rainfall_mm 8.849442e-04 2.213690
Rank deficiency = 0: Df Model (12) is same as number of Xs (12).
Partial Leverage (or Partial Regression, or Added-Variable) diagnostic plots for fit:
Tweedie fit using reproducible random 80% (x_train & y_train) of data rows:
Mean Absolute Residual = 39.49322540480849
Root Mean Squared Residual = 55.53603953998394
R² = 0.987770257107481
Tweedie prediction using remaining 20% (x_test & y_test) of data rows:
Mean Absolute Error = 40.23066289626135
Root Mean Squared Error = 52.62934342455948
R² = 0.9886616964974738
Plots of train-set fit & test-set predict:
========================================================== Next GLM Model ==============================================================
counter = 9
GLM Model Fitted = GLM Gaussian
1 entered.
Assuming GLM Gaussian error distribution.
X pairs with correlations > 0.995 :
SPY_Close_Price , SPY_Open_Price
1 variables considered for deletion:
SPY_Close_Price
X pairs with correlations > 0.995 :
(no more)
1 variables deleted.
X pairs with correlations > 0.995 :
FTSE_Open_Price_sqar , FTSE_Open_Price_sqrt
FTSE_Open_Price_sqrt , FTSE_Open_Price
FTSE_High_Price_sqar , FTSE_High_Price
FTSE_High_Price , FTSE_High_Price_sqrt
FTSE_Low_Price_sqrt , FTSE_Low_Price_sqar
FTSE_Low_Price_sqar , FTSE_Low_Price
6 variables considered for deletion:
FTSE_Open_Price_sqar
FTSE_Open_Price_sqrt
FTSE_High_Price_sqar
FTSE_High_Price_sqrt
FTSE_Low_Price_sqrt
FTSE_Low_Price_sqar
X pairs with correlations > 0.995 :
(no more)
6 transformed variables deleted.
Fit using reproducible random 80% (x_train & y_train) of data rows:
OLS fit including only 17 untransformed Xs:
Results: Ordinary least squares
============================================================================================
Model: OLS Adj. R-squared: 0.993
Dependent Variable: FTSE_Close_Price AIC: 3643.1592
Date: 2021-04-22 13:47 BIC: 3712.6534
No. Observations: 351 Log-Likelihood: -1803.6
Df Model: 17 F-statistic: 2885.
Df Residuals: 333 Prob (F-statistic): 0.00
R-squared: 0.993 Scale: 1792.9
--------------------------------------------------------------------------------------------
Coef. Std.Err. t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept 231.5217 111.9089 2.0688 0.0393 11.3842 451.6592
FTSE_Low_Price 0.7842 0.0457 17.1672 0.0000 0.6943 0.8740
FTSE_High_Price 0.6129 0.0549 11.1584 0.0000 0.5048 0.7209
FTSE_Open_Price -0.3757 0.0363 -10.3364 0.0000 -0.4472 -0.3042
SPY_Open_Price -0.6321 0.3893 -1.6237 0.1054 -1.3978 0.1337
cumCasesByPublishDate -0.0001 0.0000 -1.2514 0.2117 -0.0002 0.0000
FTSE_Volume -0.0000 0.0000 -2.5064 0.0127 -0.0000 -0.0000
US_Covid_Cases -0.0000 0.0000 -1.1749 0.2409 -0.0000 0.0000
cumVirusTests 0.0000 0.0000 2.0727 0.0390 0.0000 0.0000
US_Covid_Deaths -0.0003 0.0005 -0.5383 0.5907 -0.0013 0.0008
cumAdmissions 0.0018 0.0012 1.4977 0.1351 -0.0006 0.0042
cumPeopleVaccinatedCompleteByPublishDate -0.0000 0.0000 -2.0560 0.0406 -0.0000 -0.0000
cumDailyNsoDeathsByDeathDate -0.0026 0.0027 -0.9650 0.3352 -0.0078 0.0027
Unnamed__0 0.1010 0.2194 0.4602 0.6457 -0.3307 0.5326
Sun_Hours 0.4980 0.3462 1.4383 0.1513 -0.1831 1.1791
Max_Temperature_DegC -27.0565 12.0258 -2.2499 0.0251 -50.7126 -3.4004
Rainfall_mm -0.5798 0.5001 -1.1594 0.2471 -1.5635 0.4039
Min_Temperature_DegC 30.7883 12.9053 2.3857 0.0176 5.4022 56.1745
--------------------------------------------------------------------------------------------
Omnibus: 41.833 Durbin-Watson: 2.000
Prob(Omnibus): 0.000 Jarque-Bera (JB): 248.624
Skew: 0.195 Prob(JB): 0.000
Kurtosis: 7.105 Condition No.: 49810797143
============================================================================================
* The condition number is large (5e+10). This might indicate strong
multicollinearity or other numerical problems.
Descending order of 17 X's significance, assuming Normal error distribution:
FTSE_Low_Price
FTSE_High_Price
FTSE_Open_Price
FTSE_Volume
Min_Temperature_DegC
Max_Temperature_DegC
cumVirusTests
cumPeopleVaccinatedCompleteByPublishDate
SPY_Open_Price
cumAdmissions
Sun_Hours
cumCasesByPublishDate
US_Covid_Cases
Rainfall_mm
cumDailyNsoDeathsByDeathDate
US_Covid_Deaths
Unnamed__0
Rank deficiency = 0: Df Model (17) is same as number of Xs (17).
GLM Gaussian fit including transformed Xs:
Adjusted R² = 0.856052762689876 , max(X p-value) = 0.011548938413678775 , rank deficiency = 35 , for 45 Xs.
Variable to drop: cumVirusTests_sqrt
Adjusted R² = 0.8561870075230253 , max(X p-value) = 0.0009996748400544321 , rank deficiency = 34 , for 44 Xs.
Variable to drop: Sun_Hours_sqar
Adjusted R² = 0.8544610841112512 , max(X p-value) = 0.0006437033032813569 , rank deficiency = 33 , for 43 Xs.
Variable to drop: FTSE_Volume
Adjusted R² = 0.8674955696588441 , max(X p-value) = 0.14436933664049328 , rank deficiency = 33 , for 42 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate
Adjusted R² = 0.8670729242368037 , max(X p-value) = 0.0017105758733538484 , rank deficiency = 32 , for 41 Xs.
Variable to drop: cumCasesByPublishDate_sqar
Adjusted R² = 0.8695599953045263 , max(X p-value) = 0.0015421993439349005 , rank deficiency = 32 , for 40 Xs.
Variable to drop: cumVirusTests
Adjusted R² = 0.8685398482080795 , max(X p-value) = 0.11388725361213821 , rank deficiency = 32 , for 39 Xs.
Variable to drop: US_Covid_Deaths_sqar
Adjusted R² = 0.8710740449910657 , max(X p-value) = 0.30507430111984746 , rank deficiency = 32 , for 38 Xs.
Variable to drop: US_Covid_Deaths_sqrt
Adjusted R² = 0.8709757125007801 , max(X p-value) = 0.035363829271182534 , rank deficiency = 31 , for 37 Xs.
Variable to drop: cumAdmissions
Adjusted R² = 0.8703492104067636 , max(X p-value) = 2.8763427615658794e-16 , rank deficiency = 30 , for 36 Xs.
Variable to drop: US_Covid_Deaths
Adjusted R² = 0.8647269699383138 , max(X p-value) = 0.9166520367365151 , rank deficiency = 29 , for 35 Xs.
Variable to drop: Unnamed__0_sqar
Adjusted R² = 0.8647799764232108 , max(X p-value) = 1.3555841493299327e-09 , rank deficiency = 28 , for 34 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqar
Adjusted R² = 0.8612016795978822 , max(X p-value) = 1.935177487408958e-10 , rank deficiency = 28 , for 33 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqar
Adjusted R² = 0.8495057819559715 , max(X p-value) = 0.05838404591848423 , rank deficiency = 27 , for 32 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate
Adjusted R² = 0.8517886363042623 , max(X p-value) = 4.0829188178585944e-07 , rank deficiency = 27 , for 31 Xs.
Variable to drop: cumVirusTests_sqar
Adjusted R² = 0.8484139747871491 , max(X p-value) = 6.727615509243839e-75 , rank deficiency = 26 , for 30 Xs.
Variable to drop: cumCasesByPublishDate
Adjusted R² = 0.8108192549475064 , max(X p-value) = 1.3435680777324955e-126 , rank deficiency = 26 , for 29 Xs.
Variable to drop: cumAdmissions_sqar
Adjusted R² = 0.7200292732044025 , max(X p-value) = 0.0 , rank deficiency = 26 , for 28 Xs.
Variable to drop: FTSE_Volume_sqar
Adjusted R² = 0.9928451020967469 , max(X p-value) = 0.9468516272429597 , rank deficiency = 16 , for 27 Xs.
Variable to drop: cumAdmissions_sqrt
Adjusted R² = 0.9928451662096838 , max(X p-value) = 0.9112557588868884 , rank deficiency = 15 , for 26 Xs.
Variable to drop: SPY_Open_Price_sqar
Adjusted R² = 0.9928694724335335 , max(X p-value) = 0.987823038502728 , rank deficiency = 15 , for 25 Xs.
Variable to drop: Sun_Hours_sqrt
Adjusted R² = 0.9928695789431854 , max(X p-value) = 0.9839290052850201 , rank deficiency = 14 , for 24 Xs.
Variable to drop: Unnamed__0
Adjusted R² = 0.992869513227482 , max(X p-value) = 0.990732997456319 , rank deficiency = 13 , for 23 Xs.
Variable to drop: Max_Temperature_DegC
Adjusted R² = 0.992869575035061 , max(X p-value) = 0.8578480412786975 , rank deficiency = 12 , for 22 Xs.
Variable to drop: Max_Temperature_DegC_sqrt
Adjusted R² = 0.9928695932276799 , max(X p-value) = 0.7768450230314637 , rank deficiency = 11 , for 21 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqrt
Adjusted R² = 0.9928888408034632 , max(X p-value) = 0.8005980966453211 , rank deficiency = 11 , for 20 Xs.
Variable to drop: Min_Temperature_DegC_sqrt
Adjusted R² = 0.9928888127161739 , max(X p-value) = 0.6801103279519898 , rank deficiency = 10 , for 19 Xs.
Variable to drop: SPY_Open_Price_sqrt
Adjusted R² = 0.9928889458325815 , max(X p-value) = 0.5329894105487227 , rank deficiency = 9 , for 18 Xs.
Variable to drop: Min_Temperature_DegC_sqar
Adjusted R² = 0.9928816779689159 , max(X p-value) = 0.6973005183628291 , rank deficiency = 8 , for 17 Xs.
Variable to drop: Max_Temperature_DegC_sqar
Adjusted R² = 0.992900692175673 , max(X p-value) = 0.7633370103742867 , rank deficiency = 8 , for 16 Xs.
Variable to drop: Min_Temperature_DegC
Adjusted R² = 0.9928987942296852 , max(X p-value) = 0.4489655746137884 , rank deficiency = 7 , for 15 Xs.
Variable to drop: Sun_Hours
Adjusted R² = 0.9928876612153869 , max(X p-value) = 0.5015384195314583 , rank deficiency = 6 , for 14 Xs.
Variable to drop: Unnamed__0_sqrt
Adjusted R² = 0.9928777159892562 , max(X p-value) = 0.18561441134851298 , rank deficiency = 5 , for 13 Xs.
Variable to drop: FTSE_Volume_sqrt
Adjusted R² = 0.9928626642418298 , max(X p-value) = 0.07187490508582962 , rank deficiency = 5 , for 12 Xs.
Variable to drop: SPY_Open_Price
Adjusted R² = 0.9927964318243934 , max(X p-value) = 0.15810477343761764 , rank deficiency = 4 , for 11 Xs.
Variable to drop: US_Covid_Cases_sqrt
Adjusted R² = 0.9927534488139275 , max(X p-value) = 0.8183879429297852 , rank deficiency = 3 , for 10 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqrt
Adjusted R² = 0.9927735872808554 , max(X p-value) = 0.038354855145061206 , rank deficiency = 3 , for 9 Xs.
Variable to drop: US_Covid_Cases_sqar
Adjusted R² = 0.9926792969907646 , max(X p-value) = 0.20546943919201976 , rank deficiency = 0 , for 8 Xs.
Variable to drop: Rainfall_mm_sqrt
Adjusted R² = 0.992666425686054 , max(X p-value) = 0.1706379127251929 , rank deficiency = 0 , for 7 Xs.
Variable to drop: US_Covid_Cases
Adjusted R² = 0.9926477222985559 , max(X p-value) = 0.6509857441687742 , rank deficiency = 0 , for 6 Xs.
Variable to drop: cumCasesByPublishDate_sqrt
Adjusted R² = 0.9926646717577661 , max(X p-value) = 0.07734271652895466 , rank deficiency = 0 , for 5 Xs.
Variable to drop: Rainfall_mm
Adjusted R² = 0.9926197296220952 , max(X p-value) = 0.18707146930160945 , rank deficiency = 0 , for 4 Xs.
Variable to drop: Rainfall_mm_sqar
Adjusted R² = 0.9926039794080767 , max(X p-value) = 1.2130206336483614e-24 , rank deficiency = 0 , for 3 Xs.
Variable to drop: FTSE_Open_Price
Adjusted R² = 0.9903933859344677 , max(X p-value) = 1.769075275572336e-14 , rank deficiency = 0 , for 2 Xs.
Variable to drop: FTSE_High_Price
Adjusted R² = 0.9888031058125417 , max(X p-value) = 0.0 , rank deficiency = 0 , for 1 Xs.
Variable left: FTSE_Low_Price
Restarting from best model (with 8 Xs & Adjusted R² = 0.9926792969907646) found so far...
Adding 10 2-way interactions among 5 untransformed variables in best model found so far:
FTSE_Low_Price_x_FTSE_High_Price
FTSE_Low_Price_x_FTSE_Open_Price
FTSE_Low_Price_x_US_Covid_Cases
FTSE_Low_Price_x_Rainfall_mm
FTSE_High_Price_x_FTSE_Open_Price
FTSE_High_Price_x_US_Covid_Cases
FTSE_High_Price_x_Rainfall_mm
FTSE_Open_Price_x_US_Covid_Cases
FTSE_Open_Price_x_Rainfall_mm
US_Covid_Cases_x_Rainfall_mm
X pairs with correlations > 0.995 :
US_Covid_Cases_x_Rainfall_mm , US_Covid_Cases
FTSE_Open_Price_x_US_Covid_Cases , FTSE_High_Price_x_US_Covid_Cases
FTSE_High_Price_x_US_Covid_Cases , FTSE_Low_Price_x_US_Covid_Cases
FTSE_Open_Price , FTSE_High_Price_x_FTSE_Open_Price
FTSE_Low_Price_x_FTSE_Open_Price , FTSE_Low_Price_x_FTSE_High_Price
5 variables considered for deletion:
US_Covid_Cases_x_Rainfall_mm
FTSE_Open_Price_x_US_Covid_Cases
FTSE_High_Price_x_US_Covid_Cases
FTSE_High_Price_x_FTSE_Open_Price
FTSE_Low_Price_x_FTSE_Open_Price
X pairs with correlations > 0.995 :
FTSE_High_Price , FTSE_Low_Price_x_FTSE_High_Price
1 variables considered for deletion:
FTSE_Low_Price_x_FTSE_High_Price
X pairs with correlations > 0.995 :
(no more)
6 interaction variables deleted.
Adjusted R² = 0.9928555140083525 , max(X p-value) = 0.882804646087142 , rank deficiency = 1 , for 12 Xs.
Variable to drop: FTSE_Low_Price_x_US_Covid_Cases
Adjusted R² = 0.9928550560214098 , max(X p-value) = 0.8753664384686102 , rank deficiency = 0 , for 11 Xs.
Variable to drop: FTSE_High_Price_x_Rainfall_mm
Adjusted R² = 0.9928755535926179 , max(X p-value) = 0.2901012680219056 , rank deficiency = 0 , for 10 Xs.
Variable to drop: Rainfall_mm
Adjusted R² = 0.9928730641871576 , max(X p-value) = 0.13406734366786754 , rank deficiency = 0 , for 9 Xs.
Variable to drop: Rainfall_mm_sqrt
Adjusted R² = 0.9928471245538578 , max(X p-value) = 0.12428831062238879 , rank deficiency = 0 , for 8 Xs.
Variable to drop: US_Covid_Cases
Adjusted R² = 0.9928187125870475 , max(X p-value) = 0.5426019592392081 , rank deficiency = 0 , for 7 Xs.
Variable to drop: cumCasesByPublishDate_sqrt
Adjusted R² = 0.9928318489451495 , max(X p-value) = 0.105715109088978 , rank deficiency = 0 , for 6 Xs.
Variable to drop: Rainfall_mm_sqar
Adjusted R² = 0.992798249312866 , max(X p-value) = 0.0009634675679838689 , rank deficiency = 0 , for 5 Xs.
Variable to drop: FTSE_Open_Price_x_Rainfall_mm
Adjusted R² = 0.9925922607588152 , max(X p-value) = 0.5018298892588449 , rank deficiency = 0 , for 4 Xs.
Variable to drop: FTSE_Low_Price_x_Rainfall_mm
Adjusted R² = 0.9926039794080767 , max(X p-value) = 1.2130206336483614e-24 , rank deficiency = 0 , for 3 Xs.
Variable to drop: FTSE_Open_Price
Adjusted R² = 0.9903933859344677 , max(X p-value) = 1.769075275572336e-14 , rank deficiency = 0 , for 2 Xs.
Variable to drop: FTSE_High_Price
Adjusted R² = 0.9888031058125417 , max(X p-value) = 0.0 , rank deficiency = 0 , for 1 Xs.
Variable left: FTSE_Low_Price
Best model has 5 Xs (Adjusted R² = 0.992798249312866 , rank deficiency = 0):
Results: Generalized linear model
==============================================================================
Model: GLM AIC: 3637.1301
Link Function: identity BIC: 626367.2321
Dependent Variable: FTSE_Close_Price Log-Likelihood: -1812.6
Date: 2021-04-22 13:47 LL-Null: -25940.
No. Observations: 351 Deviance: 6.2839e+05
Df Model: 5 Pearson chi2: 6.28e+05
Df Residuals: 345 Scale: 1821.4
Method: IRLS
------------------------------------------------------------------------------
Coef. Std.Err. z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
const 71.7888 32.9569 2.1783 0.0294 7.1945 136.3830
FTSE_Low_Price 1.5819 0.2265 6.9852 0.0000 1.1380 2.0258
FTSE_High_Price 0.5187 0.0389 13.3294 0.0000 0.4424 0.5949
FTSE_Open_Price -1.0918 0.2213 -4.9343 0.0000 -1.5255 -0.6581
FTSE_Low_Price_x_Rainfall_mm -0.0096 0.0029 -3.3124 0.0009 -0.0153 -0.0039
FTSE_Open_Price_x_Rainfall_mm 0.0094 0.0028 3.3010 0.0010 0.0038 0.0150
==============================================================================
Descending order of 5 X's significance, assuming GLM Gaussian error distribution:
Coefficient z-stat
const 71.788762 2.178264
FTSE_High_Price 0.518661 13.329430
FTSE_Low_Price 1.581903 6.985237
FTSE_Open_Price -1.091824 -4.934336
FTSE_Low_Price_x_Rainfall_mm -0.009619 -3.312433
FTSE_Open_Price_x_Rainfall_mm 0.009390 3.300983
Rank deficiency = 0: Df Model (5) is same as number of Xs (5).
Partial Leverage (or Partial Regression, or Added-Variable) diagnostic plots for fit:
GLM Gaussian fit using reproducible random 80% (x_train & y_train) of data rows:
Mean Absolute Residual = 29.146367997189703
Root Mean Squared Residual = 42.31173159530557
R² = 0.9929011314655393
GLM Gaussian prediction using remaining 20% (x_test & y_test) of data rows:
Mean Absolute Error = 27.01758016799927
Root Mean Squared Error = 38.869443002350124
R² = 0.9942506743907702
Plots of train-set fit & test-set predict:
========================================================== Next GLM Model ==============================================================
counter = 10
GLM Model Fitted = Normal
1 entered.
Assuming Normal error distribution.
X pairs with correlations > 0.995 :
SPY_Close_Price , SPY_Open_Price
1 variables considered for deletion:
SPY_Close_Price
X pairs with correlations > 0.995 :
(no more)
1 variables deleted.
X pairs with correlations > 0.995 :
FTSE_Open_Price_sqar , FTSE_Open_Price_sqrt
FTSE_Open_Price_sqrt , FTSE_Open_Price
FTSE_High_Price_sqar , FTSE_High_Price
FTSE_High_Price , FTSE_High_Price_sqrt
FTSE_Low_Price_sqrt , FTSE_Low_Price_sqar
FTSE_Low_Price_sqar , FTSE_Low_Price
6 variables considered for deletion:
FTSE_Open_Price_sqar
FTSE_Open_Price_sqrt
FTSE_High_Price_sqar
FTSE_High_Price_sqrt
FTSE_Low_Price_sqrt
FTSE_Low_Price_sqar
X pairs with correlations > 0.995 :
(no more)
6 transformed variables deleted.
Fit using reproducible random 80% (x_train & y_train) of data rows:
OLS fit including only 17 untransformed Xs:
Results: Ordinary least squares
============================================================================================
Model: OLS Adj. R-squared: 0.993
Dependent Variable: FTSE_Close_Price AIC: 3643.1592
Date: 2021-04-22 13:47 BIC: 3712.6534
No. Observations: 351 Log-Likelihood: -1803.6
Df Model: 17 F-statistic: 2885.
Df Residuals: 333 Prob (F-statistic): 0.00
R-squared: 0.993 Scale: 1792.9
--------------------------------------------------------------------------------------------
Coef. Std.Err. t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept 231.5217 111.9089 2.0688 0.0393 11.3842 451.6592
FTSE_Low_Price 0.7842 0.0457 17.1672 0.0000 0.6943 0.8740
FTSE_High_Price 0.6129 0.0549 11.1584 0.0000 0.5048 0.7209
FTSE_Open_Price -0.3757 0.0363 -10.3364 0.0000 -0.4472 -0.3042
SPY_Open_Price -0.6321 0.3893 -1.6237 0.1054 -1.3978 0.1337
cumCasesByPublishDate -0.0001 0.0000 -1.2514 0.2117 -0.0002 0.0000
FTSE_Volume -0.0000 0.0000 -2.5064 0.0127 -0.0000 -0.0000
US_Covid_Cases -0.0000 0.0000 -1.1749 0.2409 -0.0000 0.0000
cumVirusTests 0.0000 0.0000 2.0727 0.0390 0.0000 0.0000
US_Covid_Deaths -0.0003 0.0005 -0.5383 0.5907 -0.0013 0.0008
cumAdmissions 0.0018 0.0012 1.4977 0.1351 -0.0006 0.0042
cumPeopleVaccinatedCompleteByPublishDate -0.0000 0.0000 -2.0560 0.0406 -0.0000 -0.0000
cumDailyNsoDeathsByDeathDate -0.0026 0.0027 -0.9650 0.3352 -0.0078 0.0027
Unnamed__0 0.1010 0.2194 0.4602 0.6457 -0.3307 0.5326
Sun_Hours 0.4980 0.3462 1.4383 0.1513 -0.1831 1.1791
Max_Temperature_DegC -27.0565 12.0258 -2.2499 0.0251 -50.7126 -3.4004
Rainfall_mm -0.5798 0.5001 -1.1594 0.2471 -1.5635 0.4039
Min_Temperature_DegC 30.7883 12.9053 2.3857 0.0176 5.4022 56.1745
--------------------------------------------------------------------------------------------
Omnibus: 41.833 Durbin-Watson: 2.000
Prob(Omnibus): 0.000 Jarque-Bera (JB): 248.624
Skew: 0.195 Prob(JB): 0.000
Kurtosis: 7.105 Condition No.: 49810797143
============================================================================================
* The condition number is large (5e+10). This might indicate strong
multicollinearity or other numerical problems.
Descending order of 17 X's significance, assuming Normal error distribution:
FTSE_Low_Price
FTSE_High_Price
FTSE_Open_Price
FTSE_Volume
Min_Temperature_DegC
Max_Temperature_DegC
cumVirusTests
cumPeopleVaccinatedCompleteByPublishDate
SPY_Open_Price
cumAdmissions
Sun_Hours
cumCasesByPublishDate
US_Covid_Cases
Rainfall_mm
cumDailyNsoDeathsByDeathDate
US_Covid_Deaths
Unnamed__0
Rank deficiency = 0: Df Model (17) is same as number of Xs (17).
Normal fit including transformed Xs:
Adjusted R² = 0.826607683992185 , max(X p-value) = 0.25796878843555066 , rank deficiency = 32 , for 45 Xs.
Variable to drop: cumVirusTests_sqrt
Adjusted R² = 0.8275973276398689 , max(X p-value) = 0.21735522380657735 , rank deficiency = 31 , for 44 Xs.
Variable to drop: Sun_Hours_sqar
Adjusted R² = 0.8250303533702403 , max(X p-value) = 0.10017061843900948 , rank deficiency = 30 , for 43 Xs.
Variable to drop: FTSE_Volume
Adjusted R² = 0.8481713995272524 , max(X p-value) = 0.292816358485037 , rank deficiency = 30 , for 42 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate
Adjusted R² = 0.8475613118694245 , max(X p-value) = 0.030740949362100967 , rank deficiency = 29 , for 41 Xs.
Variable to drop: cumCasesByPublishDate_sqar
Adjusted R² = 0.849944337764815 , max(X p-value) = 0.19967631849630305 , rank deficiency = 29 , for 40 Xs.
Variable to drop: cumVirusTests
Adjusted R² = 0.8510476355192589 , max(X p-value) = 0.24083263348139305 , rank deficiency = 29 , for 39 Xs.
Variable to drop: US_Covid_Deaths_sqar
Adjusted R² = 0.853192769760013 , max(X p-value) = 0.28671412423223436 , rank deficiency = 29 , for 38 Xs.
Variable to drop: US_Covid_Deaths_sqrt
Adjusted R² = 0.8532394736969164 , max(X p-value) = 0.16777531393277273 , rank deficiency = 28 , for 37 Xs.
Variable to drop: cumAdmissions
Adjusted R² = 0.8512359747705537 , max(X p-value) = 5.706183218060886e-08 , rank deficiency = 27 , for 36 Xs.
Variable to drop: US_Covid_Deaths
Adjusted R² = 0.8418231696904797 , max(X p-value) = 0.925646924043702 , rank deficiency = 26 , for 35 Xs.
Variable to drop: Unnamed__0_sqar
Adjusted R² = 0.8417830683319295 , max(X p-value) = 0.012284386194111353 , rank deficiency = 25 , for 34 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqar
Adjusted R² = 0.8392850213406523 , max(X p-value) = 1.320388438357376e-08 , rank deficiency = 25 , for 33 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqar
Adjusted R² = 0.8225696834188813 , max(X p-value) = 0.07582257411566104 , rank deficiency = 25 , for 32 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate
Adjusted R² = 0.822450470676079 , max(X p-value) = 0.0008237513108627133 , rank deficiency = 25 , for 31 Xs.
Variable to drop: cumVirusTests_sqar
Adjusted R² = 0.8171605257593518 , max(X p-value) = 1.9148664411599767e-14 , rank deficiency = 25 , for 30 Xs.
Variable to drop: cumCasesByPublishDate
Adjusted R² = 0.7845232112604724 , max(X p-value) = 4.8514373487687905e-28 , rank deficiency = 25 , for 29 Xs.
Variable to drop: cumAdmissions_sqar
Adjusted R² = 0.6942929828445694 , max(X p-value) = 1.5612756339242417e-21 , rank deficiency = 25 , for 28 Xs.
Variable to drop: FTSE_Volume_sqar
Adjusted R² = 0.9927161211382474 , max(X p-value) = 0.9476149041648358 , rank deficiency = 10 , for 27 Xs.
Variable to drop: cumAdmissions_sqrt
Adjusted R² = 0.9927379957475767 , max(X p-value) = 0.9123044436910306 , rank deficiency = 10 , for 26 Xs.
Variable to drop: SPY_Open_Price_sqar
Adjusted R² = 0.9927410838674037 , max(X p-value) = 0.9879811551418602 , rank deficiency = 9 , for 25 Xs.
Variable to drop: Sun_Hours_sqrt
Adjusted R² = 0.9927412135415726 , max(X p-value) = 0.9841442898537537 , rank deficiency = 8 , for 24 Xs.
Variable to drop: Unnamed__0
Adjusted R² = 0.9927628460234671 , max(X p-value) = 0.9908434757602849 , rank deficiency = 8 , for 23 Xs.
Variable to drop: Max_Temperature_DegC
Adjusted R² = 0.9927628965373916 , max(X p-value) = 0.8595330448970508 , rank deficiency = 7 , for 22 Xs.
Variable to drop: Max_Temperature_DegC_sqrt
Adjusted R² = 0.9927629131652195 , max(X p-value) = 0.7794559846361968 , rank deficiency = 6 , for 21 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqrt
Adjusted R² = 0.9927827033435752 , max(X p-value) = 0.8029367283925661 , rank deficiency = 6 , for 20 Xs.
Variable to drop: Min_Temperature_DegC_sqrt
Adjusted R² = 0.9927826907031152 , max(X p-value) = 0.6837597313876718 , rank deficiency = 5 , for 19 Xs.
Variable to drop: SPY_Open_Price_sqrt
Adjusted R² = 0.9927827986446083 , max(X p-value) = 0.5377814072605769 , rank deficiency = 4 , for 18 Xs.
Variable to drop: Min_Temperature_DegC_sqar
Adjusted R² = 0.9927971661141927 , max(X p-value) = 0.7002303075176355 , rank deficiency = 4 , for 17 Xs.
Variable to drop: Max_Temperature_DegC_sqar
Adjusted R² = 0.9928166356235602 , max(X p-value) = 0.7651969475410724 , rank deficiency = 4 , for 16 Xs.
Variable to drop: Min_Temperature_DegC
Adjusted R² = 0.9928147317440645 , max(X p-value) = 0.4528238909384099 , rank deficiency = 3 , for 15 Xs.
Variable to drop: Sun_Hours
Adjusted R² = 0.9928239685351828 , max(X p-value) = 0.504383150426027 , rank deficiency = 3 , for 14 Xs.
Variable to drop: Unnamed__0_sqrt
Adjusted R² = 0.9928145136421224 , max(X p-value) = 0.18910416899771135 , rank deficiency = 2 , for 13 Xs.
Variable to drop: FTSE_Volume_sqrt
Adjusted R² = 0.9927990640540123 , max(X p-value) = 0.07473910069592911 , rank deficiency = 2 , for 12 Xs.
Variable to drop: SPY_Open_Price
Adjusted R² = 0.9927527170941544 , max(X p-value) = 0.1612682310287071 , rank deficiency = 2 , for 11 Xs.
Variable to drop: US_Covid_Cases_sqrt
Adjusted R² = 0.9927321623793126 , max(X p-value) = 0.8190658858234556 , rank deficiency = 2 , for 10 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqrt
Adjusted R² = 0.9927522343700771 , max(X p-value) = 0.039702510766006394 , rank deficiency = 2 , for 9 Xs.
Variable to drop: US_Covid_Cases_sqar
Adjusted R² = 0.9926792969907643 , max(X p-value) = 0.2063311762565667 , rank deficiency = 0 , for 8 Xs.
Variable to drop: Rainfall_mm_sqrt
Adjusted R² = 0.992666425686054 , max(X p-value) = 0.17153404683707957 , rank deficiency = 0 , for 7 Xs.
Variable to drop: US_Covid_Cases
Adjusted R² = 0.9926477222985566 , max(X p-value) = 0.6512708938482358 , rank deficiency = 0 , for 6 Xs.
Variable to drop: cumCasesByPublishDate_sqrt
Adjusted R² = 0.9926646717577665 , max(X p-value) = 0.0782270057607111 , rank deficiency = 0 , for 5 Xs.
Variable to drop: Rainfall_mm
Adjusted R² = 0.9926197296220962 , max(X p-value) = 0.18794384119627083 , rank deficiency = 0 , for 4 Xs.
Variable to drop: Rainfall_mm_sqar
Adjusted R² = 0.9926039794080762 , max(X p-value) = 1.0538500089819825e-21 , rank deficiency = 0 , for 3 Xs.
Variable to drop: FTSE_Open_Price
Adjusted R² = 0.9903933859344672 , max(X p-value) = 1.7835644588729087e-13 , rank deficiency = 0 , for 2 Xs.
Variable to drop: FTSE_High_Price
Adjusted R² = 0.9888031058125419 , max(X p-value) = 0.0 , rank deficiency = 0 , for 1 Xs.
Variable left: FTSE_Low_Price
Restarting from best model (with 9 Xs & Adjusted R² = 0.9927522343700771) found so far...
Adding 10 2-way interactions among 5 untransformed variables in best model found so far:
FTSE_Low_Price_x_FTSE_High_Price
FTSE_Low_Price_x_FTSE_Open_Price
FTSE_Low_Price_x_US_Covid_Cases
FTSE_Low_Price_x_Rainfall_mm
FTSE_High_Price_x_FTSE_Open_Price
FTSE_High_Price_x_US_Covid_Cases
FTSE_High_Price_x_Rainfall_mm
FTSE_Open_Price_x_US_Covid_Cases
FTSE_Open_Price_x_Rainfall_mm
US_Covid_Cases_x_Rainfall_mm
X pairs with correlations > 0.995 :
US_Covid_Cases_x_Rainfall_mm , US_Covid_Cases
FTSE_Open_Price_x_US_Covid_Cases , FTSE_High_Price_x_US_Covid_Cases
FTSE_High_Price_x_US_Covid_Cases , FTSE_Low_Price_x_US_Covid_Cases
FTSE_Open_Price , FTSE_High_Price_x_FTSE_Open_Price
FTSE_Low_Price_x_FTSE_Open_Price , FTSE_Low_Price_x_FTSE_High_Price
5 variables considered for deletion:
US_Covid_Cases_x_Rainfall_mm
FTSE_Open_Price_x_US_Covid_Cases
FTSE_High_Price_x_US_Covid_Cases
FTSE_High_Price_x_FTSE_Open_Price
FTSE_Low_Price_x_FTSE_Open_Price
X pairs with correlations > 0.995 :
FTSE_High_Price , FTSE_Low_Price_x_FTSE_High_Price
1 variables considered for deletion:
FTSE_Low_Price_x_FTSE_High_Price
X pairs with correlations > 0.995 :
(no more)
6 interaction variables deleted.
Adjusted R² = 0.9929062688131765 , max(X p-value) = 0.91187728490757 , rank deficiency = 2 , for 13 Xs.
Variable to drop: FTSE_High_Price
Adjusted R² = 0.9929269367246852 , max(X p-value) = 0.6645858522393218 , rank deficiency = 2 , for 12 Xs.
Variable to drop: US_Covid_Cases
Adjusted R² = 0.992943753375399 , max(X p-value) = 0.251083358248719 , rank deficiency = 2 , for 11 Xs.
Variable to drop: FTSE_Open_Price_x_Rainfall_mm
Adjusted R² = 0.9929371513431888 , max(X p-value) = 0.08362210896937489 , rank deficiency = 2 , for 10 Xs.
Variable to drop: Rainfall_mm
Adjusted R² = 0.9929453349240146 , max(X p-value) = 0.12126963637580654 , rank deficiency = 2 , for 9 Xs.
Variable to drop: Rainfall_mm_sqrt
Adjusted R² = 0.9928955763004006 , max(X p-value) = 0.1773418689354735 , rank deficiency = 1 , for 8 Xs.
Variable to drop: Rainfall_mm_sqar
Adjusted R² = 0.9928785901976791 , max(X p-value) = 0.01654210473946162 , rank deficiency = 1 , for 7 Xs.
Variable to drop: US_Covid_Cases_sqar
Adjusted R² = 0.9927663158595111 , max(X p-value) = 0.18090162859845127 , rank deficiency = 0 , for 6 Xs.
Variable to drop: FTSE_Low_Price_x_US_Covid_Cases
Adjusted R² = 0.9927495948884503 , max(X p-value) = 0.5525036087497635 , rank deficiency = 0 , for 5 Xs.
Variable to drop: cumCasesByPublishDate_sqrt
Adjusted R² = 0.9927631413308635 , max(X p-value) = 7.166406152903059e-23 , rank deficiency = 0 , for 4 Xs.
Variable to drop: FTSE_Open_Price
Adjusted R² = 0.9904470118677471 , max(X p-value) = 6.138440916282818e-14 , rank deficiency = 0 , for 3 Xs.
Variable to drop: FTSE_Low_Price_x_Rainfall_mm
Adjusted R² = 0.988793082498854 , max(X p-value) = 0.4074620353997429 , rank deficiency = 0 , for 2 Xs.
Variable to drop: FTSE_High_Price_x_Rainfall_mm
Adjusted R² = 0.9888031058125419 , max(X p-value) = 0.0 , rank deficiency = 0 , for 1 Xs.
Variable left: FTSE_Low_Price
Best model has 6 Xs (Adjusted R² = 0.9927663158595111 , rank deficiency = 0):
Results: Ordinary least squares
==================================================================================
Model: OLS Adj. R-squared: 0.993
Dependent Variable: FTSE_Close_Price AIC: 3639.6641
Date: 2021-04-22 13:47 BIC: 3666.6896
No. Observations: 351 Log-Likelihood: -1812.8
Df Model: 6 F-statistic: 8007.
Df Residuals: 344 Prob (F-statistic): 0.00
R-squared: 0.993 Scale: 1829.5
----------------------------------------------------------------------------------
Coef. Std.Err. t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------
Intercept 27.4387 44.8441 0.6119 0.5410 -60.7644 115.6419
FTSE_Low_Price 1.3940 0.0385 36.1919 0.0000 1.3182 1.4698
FTSE_Open_Price -0.3814 0.0359 -10.6289 0.0000 -0.4520 -0.3109
FTSE_Low_Price_x_Rainfall_mm -0.0075 0.0006 -12.8477 0.0000 -0.0087 -0.0064
FTSE_High_Price_x_Rainfall_mm 0.0073 0.0006 12.8643 0.0000 0.0062 0.0084
FTSE_Low_Price_x_US_Covid_Cases -0.0000 0.0000 -1.3407 0.1809 -0.0000 0.0000
cumCasesByPublishDate_sqrt 0.0305 0.0215 1.4225 0.1558 -0.0117 0.0728
----------------------------------------------------------------------------------
Omnibus: 40.711 Durbin-Watson: 1.962
Prob(Omnibus): 0.000 Jarque-Bera (JB): 263.904
Skew: -0.010 Prob(JB): 0.000
Kurtosis: 7.248 Condition No.: 1810167996023
==================================================================================
* The condition number is large (2e+12). This might indicate strong
multicollinearity or other numerical problems.
Descending order of 6 X's significance, assuming Normal error distribution:
Coefficient z-stat
Intercept 2.743875e+01 0.611870
FTSE_Low_Price 1.393996e+00 36.191877
FTSE_High_Price_x_Rainfall_mm 7.321943e-03 12.864254
FTSE_Low_Price_x_Rainfall_mm -7.535470e-03 -12.847734
FTSE_Open_Price -3.814345e-01 -10.628935
cumCasesByPublishDate_sqrt 3.054065e-02 1.422510
FTSE_Low_Price_x_US_Covid_Cases -2.897631e-10 -1.340703
Rank deficiency = 0: Df Model (6) is same as number of Xs (6).
Partial Leverage (or Partial Regression, or Added-Variable) diagnostic plots for fit:
Normal fit using reproducible random 80% (x_train & y_train) of data rows:
Mean Absolute Residual = 29.066173911422652
Root Mean Squared Residual = 42.34393381042232
R² = 0.9928903218733488
Normal prediction using remaining 20% (x_test & y_test) of data rows:
Mean Absolute Error = 26.7707801761965
Root Mean Squared Error = 38.27870342033717
R² = 0.9944462291696584
Plots of train-set fit & test-set predict:
## Generalized Linear Model (GLM) with transformed variables & interaction variables (collinearity issue considered)
#For comparing Normal & GLM error distributions.
#algorithm is inefficient but clearer for instructional purposes
#favors fit with Xs' p-values < 0.05, and smaller rank-deficiency (number_of_Xs - Df_Model)
import warnings
warnings.filterwarnings('ignore')
import statsmodels.api as sm
import pandas as pd
#set maximum window width
#display() is imported from IPython.display, the supported public location:
#importing it from IPython.core.display has been deprecated since IPython 7.14
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>")) #change code window width to max
pd.options.display.max_columns = 0 #change output window width to max
counter = 1 #model selector: converted to the string m2 at the start of every pass of the loop below
#Run the loop body 10 times (i = 0..9); counter (1..10) picks the model on each pass, and both 1 and 10 fall back to 'Normal' (OLS)
for i in range(0,10):
print("\n \n \n \n \n ")
print("========================================================== Next GLM Model ==============================================================")
print("\n \n \n \n \n ")
print("counter = ",counter)
m2 = str(counter)
counter += 1
# 1 out of the 2 following lines should be active:
# d = '' #means nanes96 will be used
d = 1 #means use specified dataset instead of the default nanes96: d=''
if d == '':
pass
else:
#d = 1 case:
#Use cleaned_data
dt = df_clean.copy() #specifying 1st column as row labels (called 'index')
#handle special variables
yname = 'SPY_Close_Price' #name of the target (y) variable
#the standard-library 're' module is sufficient here (no need for the third-party 'regex' package);
#raw strings avoid the invalid-escape warning that a plain '\W' literal triggers
import re
#replace every non-word character (anything but letters, digits & '_') by '_' in variable names:
dt.rename(columns=lambda x: re.sub(r'\W', '_', x), inplace=True)
#apply the same substitution to yname so that it still matches its (renamed) column:
yname = re.sub(r'\W', '_', yname)
#delete any row with any missing value
dt.dropna(how='any', inplace=True)
if yname != dt.columns[0]:
dt = dt[[yname] + list(dt.columns.drop(yname))]
#m2 = input('1: OLS (default), 2: Poisson, 3: Logit, 4: Probit, 5: Gamma, 6: InverseGaussian, 7: NegativeBinomial,' +
#' 8: Tweedie, 9: Gaussian? ').strip()
#m2 is a string key selecting the error distribution.
#sets m (model label) and, for GLM choices, dist (statsmodels family instance).
#https://www.statsmodels.org/stable/glm.html
def _probit_binomial():
    #Binomial family with a probit link.
    #the link is passed as an INSTANCE: statsmodels deprecated (and newer releases reject)
    #passing the link class itself. 'Probit' is the newer class name, 'probit' the older one.
    #http://web.pdx.edu/~crkl/ceR/Python/example8_1.py
    links = sm.genmod.families.links
    link_class = getattr(links, 'Probit', None) or links.probit
    return sm.families.Binomial(link_class())
#key -> (label, callable building the family); built lazily so only the selected family is instantiated
_glm_families = {
    '2': ('Poisson', sm.families.Poisson),
    '3': ('Logit', sm.families.Binomial),
    '4': ('Probit', _probit_binomial),
    '5': ('Gamma', sm.families.Gamma),
    '6': ('InverseGaussian', sm.families.InverseGaussian),
    '7': ('NegativeBinomial', sm.families.NegativeBinomial),
    '8': ('Tweedie', sm.families.Tweedie),
    '9': ('GLM Gaussian', sm.families.Gaussian),
}
if m2 in _glm_families:
    m, _make_family = _glm_families[m2]
    dist = _make_family()
else:
    #any other key (e.g. '1' or ''): ordinary least squares; dist is left untouched
    m = 'Normal'
print('GLM Model Fitted = ',m)
ytype = type('a')
if m == 'Normal':
#there is a version of OLS that also requires endog and exog like GLM:
#http://statsmodels.org/dev/generated/statsmodels.regression.linear_model.OLS.html
#df has y followed by Xs:
if d == '':
df = pd.concat([dt.endog, dt.exog], axis=1) #a DataFrame
else:
df = dt
y = yname
else:
#df will not have y, but a column of 1s:
if d == '':
df = sm.add_constant(dt.exog) #add column of 1 to the left of dt.exog to form DataFrame
y = dt.endog #PID: 0 to 6: Party IDentification of respondent (shades of Democrat or Republican)
else:
df = sm.add_constant(dt.iloc[:, 1:])
y = dt.iloc[:, 0]
ytype = type(y)
#m3 = input('1: Reproducible output (input any integer except 2), 2: Random train-test data split [default: 1]? ').strip()
m3 = 1
if m3 != '2':
try:
if m3 == '':
m3 = 1
else:
m3 = int(m3)
print(m3, 'entered.')
except:
m3 = 1 #can be changed to any integer for reproducible randomization
print(m3, 'assumed.')
print('\nAssuming', m, 'error distribution.')
def delcorr(df):
    """Drop one X out of every near-duplicate pair of X columns of ``df``, in place.

    Xs are ranked by ``abs(corr(x, y))``. Two Xs that are adjacent in that ranking,
    whose correlations with y differ by at most ``delta`` and whose mutual
    ``abs(corr)`` exceeds ``1 - delta``, are treated as collinear and one of them
    is dropped. Passes are repeated until a pass finds nothing to drop.

    Reads ``yname`` and ``m`` (and ``y`` when ``m != 'Normal'``) from the
    enclosing script.

    Parameters
    ----------
    df : pandas.DataFrame
        Every column except the first is a candidate X (the first column is
        skipped; for ``m == 'Normal'`` the frame must also contain ``yname``).
        Columns are dropped from this frame directly.

    Returns
    -------
    list
        Names of all dropped columns, in order of deletion.
    """
    delta = 0.005 #corr difference lower limit
    #corr(Xs, y) ranked by absolute value (ascending), kept as a one-column frame named yname:
    corv = df.iloc[:, 1:].corrwith(df[yname] if m == 'Normal' else y).rename(yname).sort_values(key=abs).to_frame()
    dl2 = [] #everything dropped over all passes
    while True:
        a = abs(corv).diff() <= delta #adjacent rows with similar abs(corr(Xs, y))
        dl = [] #dropped during this pass
        print('\nX pairs with correlations >', 1 - delta, ':')
        for b in range(1, a.shape[0]):
            if not a.iloc[b, 0]:
                continue
            dv0, dv1 = a.index[b - 1], a.index[b]
            #collinear pair, and neither should already be deleted:
            if abs(df[dv0].corr(df[dv1])) > 1 - delta and dv0 not in dl and dv1 not in dl:
                #both Xs are within delta of each other in abs(corr(x, y)) by construction,
                #so corr with y cannot separate them: delete x with longer name
                #(same length: delete dv0, which ranks no higher than dv1)
                d = dv1 if len(dv0) < len(dv1) else dv0
                dl.append(d) #for en masse deletion later
                corv.drop([d], axis=0, inplace=True) #delete from column of corr with y
                print(dv0,',',dv1)
        if not dl:
            print('(no more)')
            return dl2
        df.drop(axis=1, columns=dl, inplace=True) #variables deleted en masse
        dl2 = dl2 + dl #keep for real deletion later
        print('\n' + str(len(dl)), 'variables considered for deletion:')
        print('\n'.join([str(x) for x in dl]))
#delete collinear Xs:
dl2 = delcorr(df)
#df.drop(axis=1, columns=dl2, inplace=True) #collinear Xs deleted en masse #not necessary since df operated on directly
if len(dl2) > 0:
print('\n' + str(len(dl2)) + ' variables deleted.')
#transform all Xs into either square & square-root or cube & cube-root using np.cbrt()
trf = ['_sqar', '_sqrt', '_cube', '_cbrt']
import numpy as np
for i in list(df)[1:]:
#excluded either y or column of 1s
failed = False
try:
#searching for -ve values:
df[i + trf[1]] = np.sqrt(df[i])
if df[i + trf[1]].isnull().any():
#bug reported by Sharifah
failed = True
del df[i + trf[1]]
else:
df[i + trf[0]] = df[i] ** 2.
except:
failed = True
if failed:
try:
#searching for non-numeric
df[i + trf[2]] = df[i] ** 3.
df[i + trf[3]] = np.cbrt(df[i])
except:
#column cannot be transformed
#delete non-numeric column (with no questions asked!):
df.drop(i, axis=1, inplace=True) #remove any row with any NaN
#only numeric columns left
#delete collinear Xs:
df0 = df.copy()
dl2 = delcorr(df0)
dl2 = [x for x in dl2 if x[-5:] in trf]
if len(dl2) > 0:
df.drop(axis=1, columns=dl2, inplace=True) #collinear transformed variables deleted en masse
print('\n' + str(len(dl2)) + ' transformed variables deleted.')
from sklearn.model_selection import train_test_split
#split into training & testing sets
x_train, x_test, y_train, y_test = train_test_split(df.iloc[:, 1:] if m == 'Normal' else df,
df[y] if m == 'Normal' else y, test_size=.2,
random_state=(None if m3=='2' else m3)) #set to an integer (here m3) to get reproducible output
#sort columns by absolute correlation with y, so may preferably delete last column if regression fails:
x_train = x_train[x_train.corrwith(y_train).sort_values(ascending=False, na_position='first', key=abs).index]
print('\nFit using', ('' if m3 == '2' else 'reproducible ') + 'random 80% (x_train & y_train) of data rows:')
#first do OLS on untransformed Xs:
df = pd.concat([y_train, x_train if m == 'Normal' else x_train.iloc[:, 1:]], axis=1)
xpure = [x for x in list(df) if x[-5:] not in trf]
numx = len(xpure) - 1
print('\nOLS fit including only', numx, 'untransformed Xs:')
#initialize for adj-R2:
ddf = np.inf #deficiency in degree of freedom = rank deficiency
maxR2 = -np.inf
bmodeleq = ''
bic0 = np.inf #bic kept by lowest overall rank deficiency
bic5 = bic0 #bic kept by best model with p-values < 0.05
bicd = bic0 #bic kept by lowest rank deficiency
bbic = bic0 #best bic kept by adj-R2
p05 = False #once found all Xs' p-values < 0.05
modeleq = ' + '.join(list(xpure)).replace('+', '~', 1)
from statsmodels.formula.api import ols
try:
    #baseline: plain OLS on the untransformed Xs only
    out = ols(modeleq, df).fit()
    print(out.summary2())
    if numx > 1:
        print("\nDescending order of", numx, "X's significance, assuming Normal error distribution:")
        #Xs (intercept excluded) ranked by abs(t-stat).
        #'ascending' is passed by keyword only: pandas 2.x rejects positional arguments to
        #Series.sort_values, and the resulting TypeError used to be swallowed below,
        #silently discarding this baseline model
        print('\n'.join(list(abs(out.tvalues[1:]).sort_values(ascending=False).index)))
        #if the single best variable isn't high in above ranking, collinearity might be an issue
    dfm = int(out.df_model)
    ddf = numx - dfm #rank deficiency
    maxR2 = out.rsquared_adj
    bmodeleq = modeleq
    print('\n' + 'Rank deficiency =', str(ddf) + ': Df Model (' + str(dfm) + ') is',
          ('less than' if ddf > 0 else 'same as'), 'number of Xs (' + str(numx) + ').')
except Exception as err:
    #best effort: the baseline is optional, so report the failure and carry on without it
    print('\n*** OLS fit on untransformed Xs skipped (' + type(err).__name__ + '):', err)
print('\n'+ m, 'fit including transformed Xs:')
if m != 'Normal':
#undo above ols:
ddf = np.inf
maxR2 = -np.inf
bmodeleq = ''
df = x_train
y = y_train
#initialize for rank deficiency:
ddfd = ddf #best rank deficiency
R2df = maxR2 #R2 for best rank deficiency
modeleqdf = bmodeleq #modeleq for best rank deficiency
#initialize for overall rank deficiency
ddf0 = ddfd
R2df0 = R2df
modeleqdf0 = modeleqdf
bddf = ddfd #rank deficiency for best adj-R2 model
#initialize for z-stat p-values < 0.05:
ddf5 = np.inf #rank deficiency for best model with p-values < 0.05
R205 = -np.inf #adj-R2 for best model with p-values < 0.05
modeleq05 = '' #modeleq for best model with p-values < 0.05
df0 = df.copy() #kept for inclusion of interaction variables later
#perform feature selection using adjusted R2
#model equation actually not used by GLM:
modeleq = ' + '.join(list(df)).replace('+', '~', 1)
#print(modeleq)
numx = df.shape[1] - 1
x1x2 = False #interaction variables not yet included
while True:
if m == 'Normal':
#https://www.statsmodels.org/stable/generated/statsmodels.formula.api.ols.html
out = ols(modeleq, df).fit()
R2 = out.rsquared_adj
else:
#GLM distribution
try:
#https://www.statsmodels.org/stable/generated/statsmodels.genmod.generalized_linear_model.GLM.html
out = sm.GLM(y, df, family=dist).fit()
R2 = 1 - (1 - y.corr(out.fittedvalues)**2) * (out.nobs - 1) / out.df_resid #pseudo adjusted r2
if R2 != R2 and out.fittedvalues.isna().sum() == 0:
R2 = -np.inf
#R2 = -out.bic
except:
#GLM failed! do ols for this round, just to delete 1 x:
try:
out = ols(' + '.join(list(df)).replace('+', '~', 1), df).fit() #do OLS instead
except:
pass
R2 = -np.inf
try:
maxp = max(out.pvalues[1:])
dfm = int(out.df_model)
except:
maxp = 1
dfm = 0
ddf = numx - dfm #rank deficiency
#see if a better model is found:
try:
if R2 >= maxR2 and out.fittedvalues.isna().sum() == 0:
maxR2 = R2
bmodeleq = modeleq
bddf = min(bddf, ddf)
ddf0 = ddf #best overall rank deficiency
if maxR2 == -np.inf:
bbic = out.bic
if maxp >= 0.05 and not p05:
#reset z-stat p-value criterion:
R205 = -np.inf
modeleq05 = ''
#reset rank deficiency criterion:
R2df = R205
modeleqdf = ''
ddfd = bddf #reset deficient df
else:
p05 = True
#if m != 'Normal':
# df1 = df.copy()
#see if a model is found with reduced overall rank deficiency:
if ddf < ddf0 or (ddf == ddf0 and R2 > R2df0):
R2df0 = R2
modeleqdf0 = modeleq
ddf0 = ddf #best overall rank deficiency
if maxR2 == -np.inf:
bic0 = out.bic
#see if a better model is found with max(z-stat p-value) < .05:
if maxp < .05 and (R2 > R205 or modeleq05 == ''):
R205 = R2
modeleq05 = modeleq
ddf5 = min(ddf5, ddf) #rank deficiency for best model with p-values < .05
if maxR2 == -np.inf:
bic5 = out.bic
#see if a model is found with reduced rank-deficiency:
if ddf < ddfd or (ddf == ddfd and R2 > R2df):
R2df = R2
modeleqdf = modeleq
ddfd = min(ddfd, ddf) #best rank deficiency
if maxR2 == -np.inf:
bicd = out.bic
except:
pass
print('\nAdjusted R² =', R2, ', max(X p-value) =', maxp, ', rank deficiency =', ddf, ', for', numx, 'Xs.')
if numx == 1:
print('Variable left:', modeleq[modeleq.find('~') + 2 :])
if x1x2:
#one xvar left
#get out of 'while' loop:
break
else:
if maxR2 == -np.inf and out.fittedvalues.isna().sum() > 0:
print('\n*** Y variable', yname if m == 'Normal' else y.name, 'might not work with', m, 'distribution.')
#use all Xs before deletion:
bmodeleq = ' + '.join(list(df0)).replace('+', '~', 1)
else:
#see if best model with all z-stat p-values < 0.05 is smaller than best model by adjusted R2:
if (R205 > -np.inf and len(modeleq05) < len(bmodeleq)) or (
R205 == -np.inf and (maxR2 == -np.inf or (len(modeleq05) > 0 and len(modeleq05) < len(bmodeleq)))):
bmodeleq = modeleq05
maxR2 = R205
#bddf = min(bddf, ddf5)
bddf = ddf5
if maxR2 == -np.inf:
bmodeleq0 = bmodeleq
bddf0 = bddf
bbic = bic5 #best bic
bic5 = np.inf #re-initialize
#see if model with smallest rank-deficiency is smaller than best model so far:
if (R2df > -np.inf and len(modeleqdf) < len(bmodeleq)) or (
R2df == -np.inf and (maxR2 == -np.inf or (len(modeleqdf) > 0 and len(modeleqdf) < len(bmodeleq)))):
bmodeleq = modeleqdf
maxR2 = R2df
#bddf = min(bddf, ddfd)
bddf = ddfd
if maxR2 == -np.inf:
bmodeleq0 = bmodeleq
bddf0 = bddf
bbic = bicd #best bic
# --- Restart branch ---------------------------------------------------------------
# Rebuilds the working frame from the best model found so far, adds pairwise
# interaction columns among its untransformed Xs, removes highly correlated
# interaction columns, and then `continue`s the enclosing loop.
# NOTE(review): this export lost all indentation, so which statements sit inside each
# `if`/`for` body below must be confirmed against the original notebook cell.
bicd = np.inf #re-initialize
# maxR2 still at -inf: presumably no model has produced a usable adjusted R² yet -- TODO confirm
if maxR2 == -np.inf:
#reset z-stat p-value criterion:
R205 = -np.inf
modeleq05 = ''
#reset rank deficiency criterion:
R2df = R205
modeleqdf = ''
ddfd = np.inf #reset deficient df
#add interaction variables for original untransformed variables in best model so far
# number of Xs = number of '+' characters in the formula string, plus one
numx = bmodeleq.count('+') + 1
if numx == 1:
# formula is rebuilt from every column of df0; the first '+' is turned into '~'
bmodeleq = ' + '.join(list(df0)).replace('+', '~', 1)
numx = bmodeleq.count('+') + 1
print('\nRestarting from best model (with', numx, 'Xs & Adjusted R² =', str(maxR2) + ') found so far...')
# split the formula back into column names; element 0 is the left-hand side of '~'
colname = bmodeleq.replace('~', '+').split(' + ')
# working frame restricted to the columns named in the best formula
df = df0[colname]
colname = colname[1:] #remove y or 'const'
# keep the test predictors aligned with the retained Xs
x_test = x_test[colname]
# (earlier index-based version of the interaction step, kept commented out;
#  the xpure-based code after it is what actually runs)
# for i in range(numx):
# #look for 1st transformed variable:
# if colname[i][-5:] in trf:
# i = i - 1
# #colname[i] is the last untransformed x
# break
# #actually, nothing to do if i<=0
# print('\nAdding', int((i + 1) * i / 2), '2-way interactions among', i + 1,
# 'untransformed variables in best model found so far:')
# for j in range(i):
# #untransformed x in colname up to [i]
# for k in range(j + 1, i + 1):
# a = colname[j] + '_x_' + colname[k]
# print(a)
# df[a] = df[colname[j]] * df[colname[k]]
# x_test[a] = x_test[colname[j]] * x_test[colname[k]]
# a name counts as untransformed when its last 5 characters are not in `trf`
# (presumably trf holds suffixes such as '_sqrt' / '_sqar' -- defined elsewhere, TODO confirm)
xpure = [x for x in colname if x[-5:] not in trf] #untransformed x names
i = len(xpure) - 1
#actually, nothing to do if i<=0
print('\nAdding', int((i + 1) * i / 2), '2-way interactions among', i + 1,
'untransformed variables in best model found so far:')
# visit every unordered pair (j < k) of untransformed Xs
for j in range(i):
#untransformed x in colname up to [i]
for k in range(j + 1, i + 1):
a = xpure[j] + '_x_' + xpure[k]
print(a)
# interaction = element-wise product, added to both the training frame and x_test
df[a] = df[xpure[j]] * df[xpure[k]]
x_test[a] = x_test[xpure[j]] * x_test[xpure[k]]
# snapshot that includes every newly added interaction column
df0 = df.copy()
#delete collinear Xs introduced:
# delcorr is defined elsewhere in the notebook; presumably it returns the names of
# highly correlated columns it proposes to delete -- TODO confirm
dl2 = delcorr(df)
dl2 = [x for x in dl2 if x.find('_x_') != -1] #only interaction variables kept
if len(dl2) > 0:
df0.drop(axis=1, columns=dl2, inplace=True) #collinear interaction variables deleted en masse, for real
x_test.drop(axis=1, columns=dl2, inplace=True)
#remaining Xs may be collinear
print('\n' + str(len(dl2)) + ' interaction variables deleted.')
#potential collinearity issues handled
#sort columns by absolute correlation with y, so may delete last column if regression fails:
#df0 = df0[df0.corrwith(y_train).sort_values(ascending=False, na_position='first', key=abs).index]
# first column stays in front; the rest are ordered by |correlation with y_train|,
# descending, with NaN correlations placed first
df0 = df0[[df0.columns[0]] + list(df0.iloc[:, 1:].corrwith(y_train)
.sort_values(ascending=False, na_position='first', key=abs).index)]
# formula string and X count for the rebuilt frame
modeleq = ' + '.join(list(df0)).replace('+', '~', 1)
numx = df0.shape[1] - 1
if maxR2 == -np.inf:
bddf = np.inf
ddf5 = bddf
ddfd = bddf
p05 = False
x1x2 = True #interaction variables already included
#beyond-pairwise collinearity may still be introduced with the interaction variables
df = df0.copy() #ready for continuing deletion
# hand control back to the enclosing loop with the rebuilt df / modeleq / numx
continue
# Backward-elimination step: remove the X whose |t-stat| is smallest in the current fit.
# `out` is the fitted model from this pass; the first entry of tvalues (the
# Intercept/const row in the printed summaries) is skipped.
t = out.tvalues[1:]
try:
    abs_t = abs(t)
    # on ties, the last matching variable is the one dropped
    xdrop = list(t[abs_t == min(abs_t)].index)[-1]
except Exception:
    # Best-effort fallback when no t-stat can be matched (e.g. NaN t-stats):
    # drop the last variable. `Exception` instead of a bare `except` so that
    # KeyboardInterrupt / SystemExit are no longer swallowed here.
    xdrop = list(t.index)[-1]
print('Variable to drop:', xdrop)
# errors='ignore' replaces the former try/except-pass: a column that is not
# present in df is skipped instead of raising KeyError.
df.drop(columns=xdrop, errors='ignore', inplace=True)
# rebuild the formula from the remaining columns; the first '+' becomes '~'
modeleq = ' + '.join(list(df)).replace('+', '~', 1)
numx = numx - 1
# --- Final choice among the candidate models ---------------------------------------
# Candidates visible here: bmodeleq (best so far), modeleq05 (all p-values < 0.05),
# modeleqdf (smallest rank deficiency) and modeleqdf0 (set elsewhere).
# NOTE(review): indentation was lost in this export; the nesting of the try/if/else
# statements below must be confirmed against the original notebook cell.
#see if best model with all z-stat p-values < 0.05 is smaller than best model by adjusted R2:
# NOTE(review): "smaller" is measured by the length of the formula string, not by the number of terms
if (R205 > -np.inf and len(modeleq05) < len(bmodeleq)) or (
R205 == -np.inf and (maxR2 == -np.inf or (len(modeleq05) > 0 and len(modeleq05) < len(bmodeleq)))):
bmodeleq = modeleq05
maxR2 = R205
bddf = ddf5
#see if model with smallest rank-deficiency is smaller than best model so far:
if (R2df > -np.inf and len(modeleqdf) < len(bmodeleq)) or (
R2df == -np.inf and (maxR2 == -np.inf or (len(modeleqdf) > 0 and len(modeleqdf) < len(bmodeleq)))):
bmodeleq = modeleqdf
maxR2 = R2df
bddf = ddfd
# no finite adjusted R² was recorded and the last fit produced NaN fitted values
if maxR2 == -np.inf and out.fittedvalues.isna().any():
#some nan in y fit
print('\n*** Y variable', yname if m == 'Normal' else y.name, 'might not work with', m, 'distribution.')
else:
try:
# ddf0, R2df0, bic0, bbic and modeleqdf0 are set elsewhere in the notebook
# (presumably an earlier reference fit -- TODO confirm)
if bddf >= ddf0 and ((maxR2 > -np.inf and R2df0 >= maxR2) or (maxR2 == -np.inf and bic0 <= bbic)):
# remember the current choice so the except handler below can restore it
# NOTE(review): these three are assigned only after the condition above has been
# evaluated; if that evaluation raises, the handler reads values that are unset
# or left over from an earlier pass -- confirm
bmodeleq_0 = bmodeleq
maxR2_0 = maxR2
bddf_0 = bddf
#prefer smaller rank deficiency
# refit the modeleqdf0 candidate so its p-values can be checked
if m == 'Normal':
out = ols(modeleqdf0, df0).fit()
else:
out = sm.GLM(y, df0[modeleqdf0.replace('~', '+').split(' + ')], family=dist).fit()
# adopt the candidate only if every X p-value (first entry skipped) is below 0.05
if max(out.pvalues[1:]) < 0.05:
#Xs' p-values < 0.05
bmodeleq = modeleqdf0
maxR2 = R2df0
bddf = ddf0
except:
# any failure above: fall back to the remembered choice
bmodeleq = bmodeleq_0
maxR2 = maxR2_0
bddf = bddf_0
# --- Refit the selected model (bmodeleq) and print its summary ----------------------
# NOTE(review): indentation was lost in this export; the extent of this try block
# must be confirmed against the original notebook cell.
try:
if m == 'Normal':
# Normal errors: ordinary least squares through the formula interface
out = ols(bmodeleq, df0).fit()
#collinearity is still entirely possible at this stage
# test predictors = every df0 column except the first
x_test = x_test[df0.columns[1:]]
else:
#out = sm.GLM(y, df1, family=dist).fit()
#x_test = sm.add_constant(x_test)[df1.columns]
# other families: restrict df0 to the columns named in the formula and fit a GLM of y on them
df0 = df0[bmodeleq.replace('~', '+').split(' + ')]
out = sm.GLM(y, df0, family=dist).fit()
# add a constant column to the test predictors and align their columns with df0
x_test = sm.add_constant(x_test)[df0.columns]
# number of Xs = number of '+' characters in the formula string, plus one
numx = bmodeleq.count('+') + 1
print('\nBest model has', numx, 'Xs (Adjusted R² =', str(maxR2), ', rank deficiency =', str(bddf) + '):\n')
print(out.summary2())
if m == 'Normal':
print()
if numx > 1:
    # Table of coefficient and test statistic per term: the first row of
    # params/tvalues (Intercept/const) stays on top, the remaining rows are
    # ordered by absolute statistic, largest first.
    print("Descending order of", numx, "X's significance, assuming", m, 'error distribution:')
    # `axis` is passed by keyword: pandas >= 2.0 accepts only `objs` positionally
    # in concat(), so the former `pd.concat([...], 1)` raises TypeError there.
    first_row = pd.concat([out.params[:1], out.tvalues[:1]], axis=1)
    ranked_xs = pd.concat([out.params[1:], out.tvalues[1:]], axis=1).sort_values(
        by=1, key=abs, ascending=False)
    # columns 0 and 1 are the unnamed params / tvalues Series
    print(pd.concat([first_row, ranked_xs]).rename(columns={0: "Coefficient", 1: "z-stat"}))
# --- Diagnostics for the refitted model: rank check, partial-regression plots,
# --- train/test error metrics and two scatter plots.
# NOTE(review): indentation was lost in this export; statement nesting must be
# confirmed against the original notebook cell.
#if the single best variable isn't high in above ranking, collinearity might be an issue
# model degrees of freedom reported by the fit, as an int for printing
dfm = int(out.df_model)
print('\n' + 'Rank deficiency =', str(bddf) + ': Df Model (' + str(dfm) + ') is',
('less than' if bddf > 0 else 'same as'), 'number of Xs (' + str(numx) + ').')
# the notebook's first cell already imports matplotlib.pyplot (as plt); here it is bound again as pl
import matplotlib.pyplot as pl
%matplotlib inline
pl.rcParams['lines.markersize'] = 2.5
pl.rcParams['lines.linewidth'] = 1
# if m == 'Normal':
#partial leverage plots, partial regression plots, added-variable plots
#https://r-bloggers.com/2021/03/partial-regression-plots-in-julia-python-and-r
#https://www.statsmodels.org/stable/generated/statsmodels.graphics.regressionplots.plot_partregress.html
#https://www.statsmodels.org/stable/generated/statsmodels.graphics.regressionplots.plot_partregress_grid.html
from statsmodels.graphics.regressionplots import plot_partregress_grid
# math is also imported in the notebook's first cell
import math
#includes intercept; 4 plots to a row:
nr = math.ceil((numx + 1) / 4) #number of rows of plots
pl.rcParams["figure.figsize"] = (20.2, 14 / 3 * nr) #plot height depends on number of rows of plots
print('\nPartial Leverage (or Partial Regression, or Added-Variable) diagnostic plots for fit:')
#grid(rows, columns) for plots; fixed at 4 columns of plots per row:
plot_partregress_grid(out, grid=(nr, 4))
pl.show()
# in-sample fitted values
y_fit = out.fittedvalues
# observed training response: first column of df for the Normal case, otherwise y
y_train = df.iloc[:, 0] if m == 'Normal' else y
# m3 is set elsewhere; when it equals '2' the word 'reproducible' is left out of the message
print('\n'+ m, 'fit using', ('' if m3 == '2' else 'reproducible ') + 'random 80% (x_train & y_train) of data rows:\n')
print(' Mean Absolute Residual =', abs(y_train - y_fit).mean())
print('Root Mean Squared Residual =', np.sqrt(((y_train - y_fit) ** 2.).mean()))
# R² here is the squared correlation between observed and fitted values
r2_train = y_train.corr(y_fit) ** 2.
print(' R² =', r2_train)
y_predict = out.predict(x_test) #forecast
print('\n' + m, 'prediction using remaining 20% (x_test & y_test) of data rows:\n')
print(' Mean Absolute Error =', abs(y_test - y_predict).mean())
print('Root Mean Squared Error =', np.sqrt(((y_test - y_predict) ** 2.).mean()))
# same squared-correlation definition, on the held-out rows
r2_test = y_test.corr(y_predict) ** 2.
print(' R² =', r2_test)
print('\nPlots of train-set fit & test-set predict:')
#plot y_train vs y_fit
#https://stackoverflow.com/questions/42818361/how-to-make-two-plots-side-by-side-using-python
# pl.rcParams["figure.figsize"] = (4.04, 4.04)
pl.rcParams["figure.figsize"] = (20.2, 4.5)
# pl.rcParams['lines.markersize'] = 3
# the figure is laid out as 1 row x 4 slots; only slots 1 and 2 are drawn
pl.subplot(1, 4, 1) #1 row, 4 columns, plot 1
pl.title(' y_train vs y_fit, R² = ' + str(round(r2_train, 3)))
pl.scatter(y_fit, y_train, s=3)
# pl.show()
#plot y_test vs y_predict
pl.subplot(1, 4, 2) #1 row, 4 columns, plot 2
pl.title(' y_test vs y_predict, R² = ' + str(round(r2_test, 3)))
# pl.scatter(y_predict, y_test, s=3);
pl.scatter(y_predict, y_test, s=3)
pl.show()
# NOTE(review): bare except -- every failure inside the matching try (fit, printing,
# plotting, prediction) is reported with the same "might not work" message, which
# hides the actual error
except:
print('\n*** Y variable', yname if m == 'Normal' else y.name, 'might not work with', m, 'distribution.')
========================================================== Next GLM Model ==============================================================
counter = 1
GLM Model Fitted = Normal
1 entered.
Assuming Normal error distribution.
X pairs with correlations > 0.995 :
(no more)
X pairs with correlations > 0.995 :
US_Covid_Cases_sqar , cumAdmissions_sqar
SPY_Open_Price_sqrt , SPY_Open_Price
2 variables considered for deletion:
US_Covid_Cases_sqar
SPY_Open_Price_sqrt
X pairs with correlations > 0.995 :
SPY_Open_Price_sqar , SPY_Open_Price
1 variables considered for deletion:
SPY_Open_Price_sqar
X pairs with correlations > 0.995 :
(no more)
3 transformed variables deleted.
Fit using reproducible random 80% (x_train & y_train) of data rows:
OLS fit including only 18 untransformed Xs:
Results: Ordinary least squares
==========================================================================================
Model: OLS Adj. R-squared: 0.994
Dependent Variable: SPY_Close_Price AIC: 1835.0027
Date: 2021-04-22 13:47 BIC: 1908.3576
No. Observations: 351 Log-Likelihood: -898.50
Df Model: 18 F-statistic: 3006.
Df Residuals: 332 Prob (F-statistic): 0.00
R-squared: 0.994 Scale: 10.355
------------------------------------------------------------------------------------------
Coef. Std.Err. t P>|t| [0.025 0.975]
------------------------------------------------------------------------------------------
Intercept -13.3843 8.5592 -1.5637 0.1188 -30.2213 3.4528
SPY_Open_Price 0.9383 0.0297 31.5925 0.0000 0.8799 0.9967
US_Covid_Deaths -0.0000 0.0000 -1.1948 0.2330 -0.0001 0.0000
US_Covid_Cases 0.0000 0.0000 1.8949 0.0590 -0.0000 0.0000
Unnamed__0 0.0294 0.0167 1.7603 0.0793 -0.0035 0.0622
cumVirusTests 0.0000 0.0000 0.6737 0.5010 -0.0000 0.0000
cumAdmissions -0.0001 0.0001 -1.1298 0.2594 -0.0003 0.0001
cumDailyNsoDeathsByDeathDate 0.0002 0.0002 1.2122 0.2263 -0.0002 0.0006
cumCasesByPublishDate -0.0000 0.0000 -0.1598 0.8731 -0.0000 0.0000
FTSE_Low_Price -0.0235 0.0048 -4.9218 0.0000 -0.0328 -0.0141
FTSE_Close_Price 0.0275 0.0042 6.6060 0.0000 0.0193 0.0357
FTSE_Open_Price 0.0054 0.0032 1.7088 0.0884 -0.0008 0.0117
FTSE_High_Price -0.0072 0.0049 -1.4791 0.1401 -0.0169 0.0024
cumPeopleVaccinatedCompleteByPublishDate 0.0000 0.0000 0.3621 0.7175 -0.0000 0.0000
FTSE_Volume -0.0000 0.0000 -2.4540 0.0146 -0.0000 -0.0000
Sun_Hours 0.0054 0.0264 0.2051 0.8377 -0.0465 0.0573
Rainfall_mm 0.0727 0.0381 1.9087 0.0572 -0.0022 0.1476
Max_Temperature_DegC 1.3880 0.9208 1.5073 0.1327 -0.4234 3.1994
Min_Temperature_DegC -1.4217 0.9891 -1.4374 0.1516 -3.3674 0.5240
------------------------------------------------------------------------------------------
Omnibus: 18.360 Durbin-Watson: 2.007
Prob(Omnibus): 0.000 Jarque-Bera (JB): 30.718
Skew: -0.340 Prob(JB): 0.000
Kurtosis: 4.280 Condition No.: 50134875179
==========================================================================================
* The condition number is large (5e+10). This might indicate strong
multicollinearity or other numerical problems.
Descending order of 18 X's significance, assuming Normal error distribution:
SPY_Open_Price
FTSE_Close_Price
FTSE_Low_Price
FTSE_Volume
Rainfall_mm
US_Covid_Cases
Unnamed__0
FTSE_Open_Price
Max_Temperature_DegC
FTSE_High_Price
Min_Temperature_DegC
cumDailyNsoDeathsByDeathDate
US_Covid_Deaths
cumAdmissions
cumVirusTests
cumPeopleVaccinatedCompleteByPublishDate
Sun_Hours
cumCasesByPublishDate
Rank deficiency = 0: Df Model (18) is same as number of Xs (18).
Normal fit including transformed Xs:
Adjusted R² = 0.9478169684655393 , max(X p-value) = 0.985954234936134 , rank deficiency = 36 , for 51 Xs.
Variable to drop: FTSE_Close_Price_sqar
Adjusted R² = 0.9405827032631453 , max(X p-value) = 0.66373541762269 , rank deficiency = 36 , for 50 Xs.
Variable to drop: FTSE_Volume
Adjusted R² = 0.9449206872053191 , max(X p-value) = 0.6350800931130927 , rank deficiency = 36 , for 49 Xs.
Variable to drop: FTSE_Low_Price_sqar
Adjusted R² = 0.945412610405055 , max(X p-value) = 0.5625501947137284 , rank deficiency = 36 , for 48 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqar
Adjusted R² = 0.9460985619275569 , max(X p-value) = 0.44068273057811835 , rank deficiency = 36 , for 47 Xs.
Variable to drop: US_Covid_Deaths_sqar
Adjusted R² = 0.9463047117785048 , max(X p-value) = 0.12047347853599437 , rank deficiency = 36 , for 46 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqar
Adjusted R² = 0.9458981795317696 , max(X p-value) = 0.036160816482777476 , rank deficiency = 35 , for 45 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate
Adjusted R² = 0.9458541569024241 , max(X p-value) = 0.00434626309597369 , rank deficiency = 34 , for 44 Xs.
Variable to drop: Sun_Hours_sqar
Adjusted R² = 0.9446916344097764 , max(X p-value) = 0.013819673576195943 , rank deficiency = 33 , for 43 Xs.
Variable to drop: cumAdmissions_sqrt
Adjusted R² = 0.9446782593994971 , max(X p-value) = 0.002169988412931058 , rank deficiency = 32 , for 42 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate
Adjusted R² = 0.943270258057936 , max(X p-value) = 0.001657086595347861 , rank deficiency = 32 , for 41 Xs.
Variable to drop: US_Covid_Deaths
Adjusted R² = 0.9413909519985548 , max(X p-value) = 0.019192324630964742 , rank deficiency = 31 , for 40 Xs.
Variable to drop: US_Covid_Cases
Adjusted R² = 0.9402125124635764 , max(X p-value) = 0.5383572875703694 , rank deficiency = 31 , for 39 Xs.
Variable to drop: US_Covid_Deaths_sqrt
Adjusted R² = 0.9402218848359302 , max(X p-value) = 0.030611062954542148 , rank deficiency = 30 , for 38 Xs.
Variable to drop: US_Covid_Cases_sqrt
Adjusted R² = 0.9402359005513438 , max(X p-value) = 3.008474636643294e-06 , rank deficiency = 29 , for 37 Xs.
Variable to drop: FTSE_Open_Price_sqar
Adjusted R² = 0.9366206385102381 , max(X p-value) = 2.033116147578988e-17 , rank deficiency = 29 , for 36 Xs.
Variable to drop: cumVirusTests_sqrt
Adjusted R² = 0.9366165884266224 , max(X p-value) = 1.915503472417556e-18 , rank deficiency = 28 , for 35 Xs.
Variable to drop: cumAdmissions_sqar
Adjusted R² = 0.9216335257590981 , max(X p-value) = 6.875739568743246e-11 , rank deficiency = 28 , for 34 Xs.
Variable to drop: cumCasesByPublishDate_sqar
Adjusted R² = 0.9115937418590563 , max(X p-value) = 5.147013825207387e-21 , rank deficiency = 28 , for 33 Xs.
Variable to drop: FTSE_Volume_sqar
Adjusted R² = 0.9927697252000652 , max(X p-value) = 0.7204203630389256 , rank deficiency = 19 , for 32 Xs.
Variable to drop: FTSE_High_Price
Adjusted R² = 0.9931380275772231 , max(X p-value) = 0.8204399941897406 , rank deficiency = 18 , for 31 Xs.
Variable to drop: Rainfall_mm_sqar
Adjusted R² = 0.9931159946747666 , max(X p-value) = 0.7760742197335619 , rank deficiency = 18 , for 30 Xs.
Variable to drop: Max_Temperature_DegC_sqar
Adjusted R² = 0.9932126618747562 , max(X p-value) = 0.9888613777837427 , rank deficiency = 18 , for 29 Xs.
Variable to drop: Min_Temperature_DegC
Adjusted R² = 0.9932330127081226 , max(X p-value) = 0.9741794189239255 , rank deficiency = 17 , for 28 Xs.
Variable to drop: Min_Temperature_DegC_sqar
Adjusted R² = 0.9924005797407535 , max(X p-value) = 0.7460978702217127 , rank deficiency = 16 , for 27 Xs.
Variable to drop: Unnamed__0_sqar
Adjusted R² = 0.993329535996961 , max(X p-value) = 0.6035926731095812 , rank deficiency = 16 , for 26 Xs.
Variable to drop: FTSE_Open_Price
Adjusted R² = 0.9933387650872868 , max(X p-value) = 0.14398956469623952 , rank deficiency = 16 , for 25 Xs.
Variable to drop: FTSE_High_Price_sqar
Adjusted R² = 0.9934605156544192 , max(X p-value) = 0.3820228006915348 , rank deficiency = 16 , for 24 Xs.
Variable to drop: cumCasesByPublishDate_sqrt
Adjusted R² = 0.9929343040310185 , max(X p-value) = 0.6648212615623568 , rank deficiency = 16 , for 23 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqrt
Adjusted R² = 0.9934190055744618 , max(X p-value) = 0.6902162438139163 , rank deficiency = 15 , for 22 Xs.
Variable to drop: Sun_Hours
Adjusted R² = 0.9934831128429907 , max(X p-value) = 0.8638430454200229 , rank deficiency = 14 , for 21 Xs.
Variable to drop: Rainfall_mm
Adjusted R² = 0.9934943454953131 , max(X p-value) = 0.5338233455915548 , rank deficiency = 13 , for 20 Xs.
Variable to drop: Unnamed__0_sqrt
Adjusted R² = 0.9934811904207127 , max(X p-value) = 0.32585069696809665 , rank deficiency = 12 , for 19 Xs.
Variable to drop: Unnamed__0
Adjusted R² = 0.9932161845767776 , max(X p-value) = 0.8174400458926784 , rank deficiency = 11 , for 18 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqrt
Adjusted R² = 0.9932347672545414 , max(X p-value) = 0.25294991311910264 , rank deficiency = 11 , for 17 Xs.
Variable to drop: cumAdmissions
Adjusted R² = 0.9932402807763137 , max(X p-value) = 0.00034782495726692897 , rank deficiency = 11 , for 16 Xs.
Variable to drop: cumCasesByPublishDate
Adjusted R² = 0.9930776121862502 , max(X p-value) = 0.0211799524042513 , rank deficiency = 10 , for 15 Xs.
Variable to drop: cumVirusTests_sqar
Adjusted R² = 0.9933930706265583 , max(X p-value) = 0.5284879501741218 , rank deficiency = 0 , for 14 Xs.
Variable to drop: Sun_Hours_sqrt
Adjusted R² = 0.9934048704581147 , max(X p-value) = 0.5843767842628067 , rank deficiency = 0 , for 13 Xs.
Variable to drop: Max_Temperature_DegC
Adjusted R² = 0.993418533115842 , max(X p-value) = 0.5203788233327193 , rank deficiency = 0 , for 12 Xs.
Variable to drop: Min_Temperature_DegC_sqrt
Adjusted R² = 0.9934299097865394 , max(X p-value) = 0.724109308600305 , rank deficiency = 0 , for 11 Xs.
Variable to drop: Rainfall_mm_sqrt
Adjusted R² = 0.9934468220450595 , max(X p-value) = 0.1258699008334402 , rank deficiency = 0 , for 10 Xs.
Variable to drop: FTSE_Low_Price
Adjusted R² = 0.9934207962140928 , max(X p-value) = 0.1311952036120294 , rank deficiency = 0 , for 9 Xs.
Variable to drop: FTSE_High_Price_sqrt
Adjusted R² = 0.9933959934558395 , max(X p-value) = 0.36900897193371773 , rank deficiency = 0 , for 8 Xs.
Variable to drop: FTSE_Open_Price_sqrt
Adjusted R² = 0.9933996683032327 , max(X p-value) = 0.09583175115830217 , rank deficiency = 0 , for 7 Xs.
Variable to drop: Max_Temperature_DegC_sqrt
Adjusted R² = 0.9933653446973529 , max(X p-value) = 0.07126909932971441 , rank deficiency = 0 , for 6 Xs.
Variable to drop: FTSE_Close_Price
Adjusted R² = 0.9933216183820517 , max(X p-value) = 0.1061665713594618 , rank deficiency = 0 , for 5 Xs.
Variable to drop: cumVirusTests
Adjusted R² = 0.9932902707374486 , max(X p-value) = 0.015764633163752196 , rank deficiency = 0 , for 4 Xs.
Variable to drop: FTSE_Volume_sqrt
Adjusted R² = 0.9931957761886889 , max(X p-value) = 1.1479826891071416e-09 , rank deficiency = 0 , for 3 Xs.
Variable to drop: FTSE_Close_Price_sqrt
Adjusted R² = 0.9924495478708683 , max(X p-value) = 0.6347129256625307 , rank deficiency = 0 , for 2 Xs.
Variable to drop: FTSE_Low_Price_sqrt
Adjusted R² = 0.9924662903673914 , max(X p-value) = 0.0 , rank deficiency = 0 , for 1 Xs.
Variable left: SPY_Open_Price
Restarting from best model (with 4 Xs & Adjusted R² = 0.9932902707374486) found so far...
Adding 0 2-way interactions among 1 untransformed variables in best model found so far:
X pairs with correlations > 0.995 :
(no more)
Adjusted R² = 0.9932902707374486 , max(X p-value) = 0.015764633163752196 , rank deficiency = 0 , for 4 Xs.
Variable to drop: FTSE_Volume_sqrt
Adjusted R² = 0.9931957761886889 , max(X p-value) = 1.1479826891071416e-09 , rank deficiency = 0 , for 3 Xs.
Variable to drop: FTSE_Close_Price_sqrt
Adjusted R² = 0.9924495478708683 , max(X p-value) = 0.6347129256625307 , rank deficiency = 0 , for 2 Xs.
Variable to drop: FTSE_Low_Price_sqrt
Adjusted R² = 0.9924662903673914 , max(X p-value) = 0.0 , rank deficiency = 0 , for 1 Xs.
Variable left: SPY_Open_Price
Best model has 4 Xs (Adjusted R² = 0.9932902707374486 , rank deficiency = 0):
Results: Ordinary least squares
=======================================================================
Model: OLS Adj. R-squared: 0.993
Dependent Variable: SPY_Close_Price AIC: 1836.4673
Date: 2021-04-22 13:47 BIC: 1855.7712
No. Observations: 351 Log-Likelihood: -913.23
Df Model: 4 F-statistic: 1.295e+04
Df Residuals: 346 Prob (F-statistic): 0.00
R-squared: 0.993 Scale: 10.806
-----------------------------------------------------------------------
Coef. Std.Err. t P>|t| [0.025 0.975]
-----------------------------------------------------------------------
Intercept -4.2118 5.2625 -0.8003 0.4241 -14.5624 6.1388
SPY_Open_Price 1.0070 0.0061 166.2750 0.0000 0.9951 1.0189
FTSE_Low_Price_sqrt -3.4388 0.5173 -6.6478 0.0000 -4.4563 -2.4214
FTSE_Close_Price_sqrt 3.4778 0.5255 6.6178 0.0000 2.4442 4.5114
FTSE_Volume_sqrt -0.0001 0.0000 -2.4263 0.0158 -0.0002 -0.0000
-----------------------------------------------------------------------
Omnibus: 25.063 Durbin-Watson: 2.018
Prob(Omnibus): 0.000 Jarque-Bera (JB): 34.477
Skew: -0.533 Prob(JB): 0.000
Kurtosis: 4.105 Condition No.: 911529
=======================================================================
* The condition number is large (9e+05). This might indicate
strong multicollinearity or other numerical problems.
Descending order of 4 X's significance, assuming Normal error distribution:
Coefficient z-stat
Intercept -4.211777 -0.800332
SPY_Open_Price 1.006991 166.274951
FTSE_Low_Price_sqrt -3.438845 -6.647758
FTSE_Close_Price_sqrt 3.477797 6.617810
FTSE_Volume_sqrt -0.000085 -2.426288
Rank deficiency = 0: Df Model (4) is same as number of Xs (4).
Partial Leverage (or Partial Regression, or Added-Variable) diagnostic plots for fit:
Normal fit using reproducible random 80% (x_train & y_train) of data rows:
Mean Absolute Residual = 2.437777470045315
Root Mean Squared Residual = 3.2637168743878178
R² = 0.9933669533575931
Normal prediction using remaining 20% (x_test & y_test) of data rows:
Mean Absolute Error = 2.278512896432442
Root Mean Squared Error = 3.282785931543249
R² = 0.9922985759558204
Plots of train-set fit & test-set predict:
========================================================== Next GLM Model ==============================================================
counter = 2
GLM Model Fitted = Poisson
1 entered.
Assuming Poisson error distribution.
X pairs with correlations > 0.995 :
(no more)
X pairs with correlations > 0.995 :
US_Covid_Cases_sqar , cumAdmissions_sqar
SPY_Open_Price_sqrt , SPY_Open_Price
2 variables considered for deletion:
US_Covid_Cases_sqar
SPY_Open_Price_sqrt
X pairs with correlations > 0.995 :
SPY_Open_Price_sqar , SPY_Open_Price
1 variables considered for deletion:
SPY_Open_Price_sqar
X pairs with correlations > 0.995 :
(no more)
3 transformed variables deleted.
Fit using reproducible random 80% (x_train & y_train) of data rows:
OLS fit including only 18 untransformed Xs:
Results: Ordinary least squares
==========================================================================================
Model: OLS Adj. R-squared: 0.994
Dependent Variable: SPY_Close_Price AIC: 1835.0027
Date: 2021-04-22 13:47 BIC: 1908.3576
No. Observations: 351 Log-Likelihood: -898.50
Df Model: 18 F-statistic: 3006.
Df Residuals: 332 Prob (F-statistic): 0.00
R-squared: 0.994 Scale: 10.355
------------------------------------------------------------------------------------------
Coef. Std.Err. t P>|t| [0.025 0.975]
------------------------------------------------------------------------------------------
Intercept -13.3843 8.5592 -1.5637 0.1188 -30.2213 3.4528
SPY_Open_Price 0.9383 0.0297 31.5925 0.0000 0.8799 0.9967
US_Covid_Deaths -0.0000 0.0000 -1.1948 0.2330 -0.0001 0.0000
US_Covid_Cases 0.0000 0.0000 1.8949 0.0590 -0.0000 0.0000
Unnamed__0 0.0294 0.0167 1.7603 0.0793 -0.0035 0.0622
cumVirusTests 0.0000 0.0000 0.6737 0.5010 -0.0000 0.0000
cumAdmissions -0.0001 0.0001 -1.1298 0.2594 -0.0003 0.0001
cumDailyNsoDeathsByDeathDate 0.0002 0.0002 1.2122 0.2263 -0.0002 0.0006
cumCasesByPublishDate -0.0000 0.0000 -0.1598 0.8731 -0.0000 0.0000
FTSE_Low_Price -0.0235 0.0048 -4.9218 0.0000 -0.0328 -0.0141
FTSE_Close_Price 0.0275 0.0042 6.6060 0.0000 0.0193 0.0357
FTSE_Open_Price 0.0054 0.0032 1.7088 0.0884 -0.0008 0.0117
FTSE_High_Price -0.0072 0.0049 -1.4791 0.1401 -0.0169 0.0024
cumPeopleVaccinatedCompleteByPublishDate 0.0000 0.0000 0.3621 0.7175 -0.0000 0.0000
FTSE_Volume -0.0000 0.0000 -2.4540 0.0146 -0.0000 -0.0000
Sun_Hours 0.0054 0.0264 0.2051 0.8377 -0.0465 0.0573
Rainfall_mm 0.0727 0.0381 1.9087 0.0572 -0.0022 0.1476
Max_Temperature_DegC 1.3880 0.9208 1.5073 0.1327 -0.4234 3.1994
Min_Temperature_DegC -1.4217 0.9891 -1.4374 0.1516 -3.3674 0.5240
------------------------------------------------------------------------------------------
Omnibus: 18.360 Durbin-Watson: 2.007
Prob(Omnibus): 0.000 Jarque-Bera (JB): 30.718
Skew: -0.340 Prob(JB): 0.000
Kurtosis: 4.280 Condition No.: 50134875179
==========================================================================================
* The condition number is large (5e+10). This might indicate strong
multicollinearity or other numerical problems.
Descending order of 18 X's significance, assuming Normal error distribution:
SPY_Open_Price
FTSE_Close_Price
FTSE_Low_Price
FTSE_Volume
Rainfall_mm
US_Covid_Cases
Unnamed__0
FTSE_Open_Price
Max_Temperature_DegC
FTSE_High_Price
Min_Temperature_DegC
cumDailyNsoDeathsByDeathDate
US_Covid_Deaths
cumAdmissions
cumVirusTests
cumPeopleVaccinatedCompleteByPublishDate
Sun_Hours
cumCasesByPublishDate
Rank deficiency = 0: Df Model (18) is same as number of Xs (18).
Poisson fit including transformed Xs:
Adjusted R² = 0.26337472323779754 , max(X p-value) = 4.510877789586801e-11 , rank deficiency = 38 , for 51 Xs.
Variable to drop: FTSE_Volume_sqar
Adjusted R² = -0.030816358273417332 , max(X p-value) = 0.44261716461280953 , rank deficiency = 29 , for 50 Xs.
Variable to drop: SPY_Open_Price
Adjusted R² = -inf , max(X p-value) = 0.7821873303696341 , rank deficiency = 27 , for 49 Xs.
Variable to drop: FTSE_Volume
Adjusted R² = -inf , max(X p-value) = 0.8898065744382733 , rank deficiency = 27 , for 48 Xs.
Variable to drop: US_Covid_Deaths
Adjusted R² = -inf , max(X p-value) = 0.9809144015523092 , rank deficiency = 26 , for 47 Xs.
Variable to drop: Unnamed__0_sqar
Adjusted R² = -inf , max(X p-value) = 0.6996124470605272 , rank deficiency = 26 , for 46 Xs.
Variable to drop: FTSE_Volume_sqrt
Adjusted R² = -inf , max(X p-value) = 0.9050222735872183 , rank deficiency = 26 , for 45 Xs.
Variable to drop: Rainfall_mm
Adjusted R² = -inf , max(X p-value) = 0.7729701428060125 , rank deficiency = 25 , for 44 Xs.
Variable to drop: Sun_Hours
Adjusted R² = 0.060582237010261886 , max(X p-value) = 0.9438973428476511 , rank deficiency = 25 , for 43 Xs.
Variable to drop: cumVirusTests_sqrt
Adjusted R² = -inf , max(X p-value) = 0.9542125101697058 , rank deficiency = 24 , for 42 Xs.
Variable to drop: Min_Temperature_DegC
Adjusted R² = -inf , max(X p-value) = 0.8684802190715313 , rank deficiency = 23 , for 41 Xs.
Variable to drop: US_Covid_Deaths_sqrt
Adjusted R² = -inf , max(X p-value) = 0.9561444739911025 , rank deficiency = 22 , for 40 Xs.
Variable to drop: Sun_Hours_sqrt
Adjusted R² = -inf , max(X p-value) = 0.8341322886314934 , rank deficiency = 21 , for 39 Xs.
Variable to drop: cumCasesByPublishDate_sqar
Adjusted R² = -inf , max(X p-value) = 0.5894064504659111 , rank deficiency = 21 , for 38 Xs.
Variable to drop: cumVirusTests_sqar
Adjusted R² = 0.983894135470667 , max(X p-value) = 0.9630060886265759 , rank deficiency = 10 , for 37 Xs.
Variable to drop: Sun_Hours_sqar
Adjusted R² = 0.9839053891916058 , max(X p-value) = 0.9046215231596222 , rank deficiency = 9 , for 36 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqar
Adjusted R² = 0.983993952751736 , max(X p-value) = 0.8945834855522539 , rank deficiency = 5 , for 35 Xs.
Variable to drop: Max_Temperature_DegC_sqrt
Adjusted R² = 0.9839841998347815 , max(X p-value) = 0.8755618734852905 , rank deficiency = 4 , for 34 Xs.
Variable to drop: FTSE_Low_Price_sqar
Adjusted R² = 0.9839781634609812 , max(X p-value) = 0.7560363775780456 , rank deficiency = 3 , for 33 Xs.
Variable to drop: Unnamed__0
Adjusted R² = 0.9839967946843232 , max(X p-value) = 0.7697437356398072 , rank deficiency = 3 , for 32 Xs.
Variable to drop: Min_Temperature_DegC_sqrt
Adjusted R² = 0.9840011514746106 , max(X p-value) = 0.8581634403169175 , rank deficiency = 3 , for 31 Xs.
Variable to drop: Max_Temperature_DegC
Adjusted R² = 0.9840290217591802 , max(X p-value) = 0.9196646967056145 , rank deficiency = 3 , for 30 Xs.
Variable to drop: Max_Temperature_DegC_sqar
Adjusted R² = 0.9840721890744782 , max(X p-value) = 0.8424818442581268 , rank deficiency = 3 , for 29 Xs.
Variable to drop: Rainfall_mm_sqrt
Adjusted R² = 0.9840996982465094 , max(X p-value) = 0.8200745502526866 , rank deficiency = 3 , for 28 Xs.
Variable to drop: Rainfall_mm_sqar
Adjusted R² = 0.9841105861181442 , max(X p-value) = 0.7352624947916921 , rank deficiency = 3 , for 27 Xs.
Variable to drop: FTSE_Open_Price_sqrt
Adjusted R² = 0.9840941646264524 , max(X p-value) = 0.6936586271227163 , rank deficiency = 3 , for 26 Xs.
Variable to drop: US_Covid_Deaths_sqar
Adjusted R² = 0.9839934807329299 , max(X p-value) = 0.6616182970263955 , rank deficiency = 2 , for 25 Xs.
Variable to drop: FTSE_High_Price_sqrt
Adjusted R² = 0.983860735329951 , max(X p-value) = 0.6938863431040673 , rank deficiency = 1 , for 24 Xs.
Variable to drop: FTSE_Open_Price_sqar
Adjusted R² = 0.9838031676260994 , max(X p-value) = 0.7161834997680584 , rank deficiency = 1 , for 23 Xs.
Variable to drop: FTSE_High_Price_sqar
Adjusted R² = 0.9837952084819037 , max(X p-value) = 0.5801867054382803 , rank deficiency = 1 , for 22 Xs.
Variable to drop: cumCasesByPublishDate_sqrt
Adjusted R² = 0.9836773670884676 , max(X p-value) = 0.6307107532252838 , rank deficiency = 1 , for 21 Xs.
Variable to drop: cumCasesByPublishDate
Adjusted R² = 0.9836873224165844 , max(X p-value) = 0.4887290698945196 , rank deficiency = 1 , for 20 Xs.
Variable to drop: Min_Temperature_DegC_sqar
Adjusted R² = 0.9834495389709071 , max(X p-value) = 0.2810630344466837 , rank deficiency = 1 , for 19 Xs.
Variable to drop: FTSE_High_Price
Adjusted R² = 0.9829888215283717 , max(X p-value) = 0.3254299148032558 , rank deficiency = 1 , for 18 Xs.
Variable to drop: FTSE_Low_Price_sqrt
Adjusted R² = 0.9825253297036338 , max(X p-value) = 0.46379931979762645 , rank deficiency = 1 , for 17 Xs.
Variable to drop: FTSE_Low_Price
Adjusted R² = 0.9821602151850062 , max(X p-value) = 0.33856218846538777 , rank deficiency = 1 , for 16 Xs.
Variable to drop: FTSE_Open_Price
Adjusted R² = 0.9818126125393816 , max(X p-value) = 0.06291141778970835 , rank deficiency = 1 , for 15 Xs.
Variable to drop: Unnamed__0_sqrt
Adjusted R² = 0.9798595546818034 , max(X p-value) = 0.0926956227464959 , rank deficiency = 1 , for 14 Xs.
Variable to drop: FTSE_Close_Price_sqar
Adjusted R² = 0.979648522140971 , max(X p-value) = 0.9319497473919178 , rank deficiency = 1 , for 13 Xs.
Variable to drop: FTSE_Close_Price
Adjusted R² = 0.9796413153064171 , max(X p-value) = 0.004764051906131865 , rank deficiency = 0 , for 12 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqar
Adjusted R² = 0.9752992862734469 , max(X p-value) = 0.13108018749321337 , rank deficiency = 0 , for 11 Xs.
Variable to drop: cumVirusTests
Adjusted R² = 0.9743410168184665 , max(X p-value) = 0.14050457874493863 , rank deficiency = 0 , for 10 Xs.
Variable to drop: cumAdmissions_sqrt
Adjusted R² = 0.9734505065569959 , max(X p-value) = 0.2013605347819356 , rank deficiency = 0 , for 9 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate
Adjusted R² = 0.9723248211972364 , max(X p-value) = 0.866505319085078 , rank deficiency = 0 , for 8 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqrt
Adjusted R² = 0.9723856267016588 , max(X p-value) = 0.09013003973582276 , rank deficiency = 0 , for 7 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqrt
Adjusted R² = 0.9706886043188151 , max(X p-value) = 0.14163308335565555 , rank deficiency = 0 , for 6 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate
Adjusted R² = 0.9698517190116132 , max(X p-value) = 1.0988221356482437e-08 , rank deficiency = 0 , for 5 Xs.
Variable to drop: cumAdmissions_sqar
Adjusted R² = 0.9486728919910477 , max(X p-value) = 0.008660485096731504 , rank deficiency = 0 , for 4 Xs.
Variable to drop: cumAdmissions
Adjusted R² = 0.9458653329362319 , max(X p-value) = 1.759051596243539e-16 , rank deficiency = 0 , for 3 Xs.
Variable to drop: US_Covid_Cases
Adjusted R² = 0.9087585822550082 , max(X p-value) = 1.1935570524740047e-65 , rank deficiency = 0 , for 2 Xs.
Variable to drop: FTSE_Close_Price_sqrt
Adjusted R² = 0.7629361670170408 , max(X p-value) = 3.3654555975721586e-280 , rank deficiency = 0 , for 1 Xs.
Variable left: US_Covid_Cases_sqrt
Restarting from best model (with 12 Xs & Adjusted R² = 0.9796413153064171) found so far...
Adding 10 2-way interactions among 5 untransformed variables in best model found so far:
US_Covid_Cases_x_cumVirusTests
US_Covid_Cases_x_cumAdmissions
US_Covid_Cases_x_cumDailyNsoDeathsByDeathDate
US_Covid_Cases_x_cumPeopleVaccinatedCompleteByPublishDate
cumVirusTests_x_cumAdmissions
cumVirusTests_x_cumDailyNsoDeathsByDeathDate
cumVirusTests_x_cumPeopleVaccinatedCompleteByPublishDate
cumAdmissions_x_cumDailyNsoDeathsByDeathDate
cumAdmissions_x_cumPeopleVaccinatedCompleteByPublishDate
cumDailyNsoDeathsByDeathDate_x_cumPeopleVaccinatedCompleteByPublishDate
X pairs with correlations > 0.995 :
cumAdmissions_x_cumPeopleVaccinatedCompleteByPublishDate , cumDailyNsoDeathsByDeathDate_x_cumPeopleVaccinatedCompleteByPublishDate
US_Covid_Cases_x_cumVirusTests , cumVirusTests_x_cumAdmissions
cumVirusTests_x_cumAdmissions , cumVirusTests_x_cumDailyNsoDeathsByDeathDate
US_Covid_Cases_x_cumAdmissions , cumAdmissions_x_cumDailyNsoDeathsByDeathDate
4 variables considered for deletion:
cumDailyNsoDeathsByDeathDate_x_cumPeopleVaccinatedCompleteByPublishDate
US_Covid_Cases_x_cumVirusTests
cumVirusTests_x_cumDailyNsoDeathsByDeathDate
cumAdmissions_x_cumDailyNsoDeathsByDeathDate
X pairs with correlations > 0.995 :
cumAdmissions_x_cumPeopleVaccinatedCompleteByPublishDate , US_Covid_Cases_x_cumPeopleVaccinatedCompleteByPublishDate
1 variables considered for deletion:
US_Covid_Cases_x_cumPeopleVaccinatedCompleteByPublishDate
X pairs with correlations > 0.995 :
(no more)
5 interaction variables deleted.
Adjusted R² = 0.8414026087857724 , max(X p-value) = 0.6979102436680902 , rank deficiency = 4 , for 17 Xs.
Variable to drop: US_Covid_Cases
Adjusted R² = 0.8418357875182605 , max(X p-value) = 0.5756444402236363 , rank deficiency = 4 , for 16 Xs.
Variable to drop: cumAdmissions_sqar
Adjusted R² = 0.8412897130908661 , max(X p-value) = 0.08228635453211393 , rank deficiency = 3 , for 15 Xs.
Variable to drop: cumVirusTests_x_cumAdmissions
Adjusted R² = 0.8402355625193314 , max(X p-value) = 1.9944904037569952e-07 , rank deficiency = 2 , for 14 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqar
Adjusted R² = 0.837048171825336 , max(X p-value) = 0.016572641566935248 , rank deficiency = 2 , for 13 Xs.
Variable to drop: US_Covid_Cases_x_cumAdmissions
Adjusted R² = 0.8347295268011248 , max(X p-value) = 5.837905327661224e-06 , rank deficiency = 2 , for 12 Xs.
Variable to drop: cumAdmissions
Adjusted R² = 0.8333205481485273 , max(X p-value) = 0.010337233771880883 , rank deficiency = 2 , for 11 Xs.
Variable to drop: cumAdmissions_sqrt
Adjusted R² = 0.8292879420008757 , max(X p-value) = 0.04020906465614028 , rank deficiency = 1 , for 10 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqrt
Adjusted R² = 0.8306107130334409 , max(X p-value) = 0.0005124152348061409 , rank deficiency = 1 , for 9 Xs.
Variable to drop: cumVirusTests
Adjusted R² = 0.827261169895052 , max(X p-value) = 2.4643234223656836e-18 , rank deficiency = 1 , for 8 Xs.
Variable to drop: cumVirusTests_x_cumPeopleVaccinatedCompleteByPublishDate
Adjusted R² = 0.9607319333597728 , max(X p-value) = 0.48055298238404787 , rank deficiency = 1 , for 7 Xs.
Variable to drop: US_Covid_Cases_sqrt
Adjusted R² = 0.9605229882605885 , max(X p-value) = 0.30027292754370005 , rank deficiency = 1 , for 6 Xs.
Variable to drop: US_Covid_Cases_x_cumDailyNsoDeathsByDeathDate
Adjusted R² = 0.9602248087860206 , max(X p-value) = 4.6370394744708553e-10 , rank deficiency = 0 , for 5 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqrt
Adjusted R² = 0.9371880309057168 , max(X p-value) = 5.769395681782629e-37 , rank deficiency = 0 , for 4 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate
Adjusted R² = 0.8434874555988491 , max(X p-value) = 0.00011347812390080424 , rank deficiency = 0 , for 3 Xs.
Variable to drop: cumAdmissions_x_cumPeopleVaccinatedCompleteByPublishDate
Adjusted R² = 0.8359956829012313 , max(X p-value) = 1.858753197013615e-70 , rank deficiency = 0 , for 2 Xs.
Variable to drop: FTSE_Close_Price_sqrt
Adjusted R² = 0.6755723932122122 , max(X p-value) = 2.597262777167026e-250 , rank deficiency = 0 , for 1 Xs.
Variable left: cumDailyNsoDeathsByDeathDate
Best model has 12 Xs (Adjusted R² = 0.9796413153064171 , rank deficiency = 0):
Results: Generalized linear model
=============================================================================================
Model: GLM AIC: 2743.4099
Link Function: log BIC: -1945.6431
Dependent Variable: SPY_Close_Price Log-Likelihood: -1358.7
Date: 2021-04-22 13:47 LL-Null: -2202.9
No. Observations: 351 Deviance: 35.303
Df Model: 12 Pearson chi2: 35.4
Df Residuals: 338 Scale: 1.0000
Method: IRLS
---------------------------------------------------------------------------------------------
Coef. Std.Err. z P>|z| [0.025 0.975]
---------------------------------------------------------------------------------------------
const 4.0824 0.1603 25.4666 0.0000 3.7682 4.3966
US_Covid_Cases_sqrt 0.0003 0.0000 5.9919 0.0000 0.0002 0.0003
US_Covid_Cases -0.0000 0.0000 -3.0613 0.0022 -0.0000 -0.0000
cumVirusTests 0.0000 0.0000 2.9091 0.0036 0.0000 0.0000
cumAdmissions -0.0000 0.0000 -3.9400 0.0001 -0.0000 -0.0000
cumDailyNsoDeathsByDeathDate 0.0000 0.0000 3.8895 0.0001 0.0000 0.0000
cumDailyNsoDeathsByDeathDate_sqar -0.0000 0.0000 -2.8226 0.0048 -0.0000 -0.0000
cumAdmissions_sqrt 0.0032 0.0010 3.1250 0.0018 0.0012 0.0052
cumAdmissions_sqar 0.0000 0.0000 3.4461 0.0006 0.0000 0.0000
cumDailyNsoDeathsByDeathDate_sqrt -0.0041 0.0012 -3.5820 0.0003 -0.0064 -0.0019
cumPeopleVaccinatedCompleteByPublishDate_sqrt -0.0006 0.0002 -3.3239 0.0009 -0.0009 -0.0002
FTSE_Close_Price_sqrt 0.0197 0.0018 11.0474 0.0000 0.0162 0.0232
cumPeopleVaccinatedCompleteByPublishDate 0.0000 0.0000 2.9684 0.0030 0.0000 0.0000
=============================================================================================
Descending order of 12 X's significance, assuming Poisson error distribution:
Coefficient z-stat
const 4.082369e+00 25.466637
FTSE_Close_Price_sqrt 1.972912e-02 11.047435
US_Covid_Cases_sqrt 2.516257e-04 5.991869
cumAdmissions -1.093186e-05 -3.939988
cumDailyNsoDeathsByDeathDate 2.236945e-05 3.889547
cumDailyNsoDeathsByDeathDate_sqrt -4.143900e-03 -3.582035
cumAdmissions_sqar 1.355892e-11 3.446093
cumPeopleVaccinatedCompleteByPublishDate_sqrt -5.806812e-04 -3.323872
cumAdmissions_sqrt 3.199659e-03 3.125012
US_Covid_Cases -3.226304e-08 -3.061288
cumPeopleVaccinatedCompleteByPublishDate 1.373353e-07 2.968448
cumVirusTests 6.007922e-09 2.909134
cumDailyNsoDeathsByDeathDate_sqar -8.866150e-11 -2.822569
Rank deficiency = 0: Df Model (12) is same as number of Xs (12).
Partial Leverage (or Partial Regression, or Added-Variable) diagnostic plots for fit:
Poisson fit using reproducible random 80% (x_train & y_train) of data rows:
Mean Absolute Residual = 4.2709955736092295
Root Mean Squared Residual = 5.619089179801603
R² = 0.9803393273530542
Poisson prediction using remaining 20% (x_test & y_test) of data rows:
Mean Absolute Error = 4.5821337762598064
Root Mean Squared Error = 6.019701414661133
R² = 0.972949613814438
Plots of train-set fit & test-set predict:
========================================================== Next GLM Model ==============================================================
counter = 3
GLM Model Fitted = Logit
1 entered.
Assuming Logit error distribution.
X pairs with correlations > 0.995 :
(no more)
X pairs with correlations > 0.995 :
US_Covid_Cases_sqar , cumAdmissions_sqar
SPY_Open_Price_sqrt , SPY_Open_Price
2 variables considered for deletion:
US_Covid_Cases_sqar
SPY_Open_Price_sqrt
X pairs with correlations > 0.995 :
SPY_Open_Price_sqar , SPY_Open_Price
1 variables considered for deletion:
SPY_Open_Price_sqar
X pairs with correlations > 0.995 :
(no more)
3 transformed variables deleted.
Fit using reproducible random 80% (x_train & y_train) of data rows:
OLS fit including only 18 untransformed Xs:
Results: Ordinary least squares
==========================================================================================
Model: OLS Adj. R-squared: 0.994
Dependent Variable: SPY_Close_Price AIC: 1835.0027
Date: 2021-04-22 13:47 BIC: 1908.3576
No. Observations: 351 Log-Likelihood: -898.50
Df Model: 18 F-statistic: 3006.
Df Residuals: 332 Prob (F-statistic): 0.00
R-squared: 0.994 Scale: 10.355
------------------------------------------------------------------------------------------
Coef. Std.Err. t P>|t| [0.025 0.975]
------------------------------------------------------------------------------------------
Intercept -13.3843 8.5592 -1.5637 0.1188 -30.2213 3.4528
SPY_Open_Price 0.9383 0.0297 31.5925 0.0000 0.8799 0.9967
US_Covid_Deaths -0.0000 0.0000 -1.1948 0.2330 -0.0001 0.0000
US_Covid_Cases 0.0000 0.0000 1.8949 0.0590 -0.0000 0.0000
Unnamed__0 0.0294 0.0167 1.7603 0.0793 -0.0035 0.0622
cumVirusTests 0.0000 0.0000 0.6737 0.5010 -0.0000 0.0000
cumAdmissions -0.0001 0.0001 -1.1298 0.2594 -0.0003 0.0001
cumDailyNsoDeathsByDeathDate 0.0002 0.0002 1.2122 0.2263 -0.0002 0.0006
cumCasesByPublishDate -0.0000 0.0000 -0.1598 0.8731 -0.0000 0.0000
FTSE_Low_Price -0.0235 0.0048 -4.9218 0.0000 -0.0328 -0.0141
FTSE_Close_Price 0.0275 0.0042 6.6060 0.0000 0.0193 0.0357
FTSE_Open_Price 0.0054 0.0032 1.7088 0.0884 -0.0008 0.0117
FTSE_High_Price -0.0072 0.0049 -1.4791 0.1401 -0.0169 0.0024
cumPeopleVaccinatedCompleteByPublishDate 0.0000 0.0000 0.3621 0.7175 -0.0000 0.0000
FTSE_Volume -0.0000 0.0000 -2.4540 0.0146 -0.0000 -0.0000
Sun_Hours 0.0054 0.0264 0.2051 0.8377 -0.0465 0.0573
Rainfall_mm 0.0727 0.0381 1.9087 0.0572 -0.0022 0.1476
Max_Temperature_DegC 1.3880 0.9208 1.5073 0.1327 -0.4234 3.1994
Min_Temperature_DegC -1.4217 0.9891 -1.4374 0.1516 -3.3674 0.5240
------------------------------------------------------------------------------------------
Omnibus: 18.360 Durbin-Watson: 2.007
Prob(Omnibus): 0.000 Jarque-Bera (JB): 30.718
Skew: -0.340 Prob(JB): 0.000
Kurtosis: 4.280 Condition No.: 50134875179
==========================================================================================
* The condition number is large (5e+10). This might indicate strong
multicollinearity or other numerical problems.
Descending order of 18 X's significance, assuming Normal error distribution:
SPY_Open_Price
FTSE_Close_Price
FTSE_Low_Price
FTSE_Volume
Rainfall_mm
US_Covid_Cases
Unnamed__0
FTSE_Open_Price
Max_Temperature_DegC
FTSE_High_Price
Min_Temperature_DegC
cumDailyNsoDeathsByDeathDate
US_Covid_Deaths
cumAdmissions
cumVirusTests
cumPeopleVaccinatedCompleteByPublishDate
Sun_Hours
cumCasesByPublishDate
Rank deficiency = 0: Df Model (18) is same as number of Xs (18).
Logit fit including transformed Xs:
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 38 , for 51 Xs.
Variable to drop: FTSE_Volume_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 29 , for 50 Xs.
Variable to drop: FTSE_Close_Price_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 29 , for 49 Xs.
Variable to drop: FTSE_Open_Price_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 28 , for 48 Xs.
Variable to drop: FTSE_Open_Price_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 28 , for 47 Xs.
Variable to drop: FTSE_Open_Price
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 27 , for 46 Xs.
Variable to drop: Max_Temperature_DegC_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 26 , for 45 Xs.
Variable to drop: cumVirusTests_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 12 , for 44 Xs.
Variable to drop: Sun_Hours
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 12 , for 43 Xs.
Variable to drop: Min_Temperature_DegC_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 11 , for 42 Xs.
Variable to drop: Rainfall_mm_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 10 , for 41 Xs.
Variable to drop: US_Covid_Cases_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 10 , for 40 Xs.
Variable to drop: Sun_Hours_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 10 , for 39 Xs.
Variable to drop: Rainfall_mm
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 10 , for 38 Xs.
Variable to drop: Min_Temperature_DegC
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 9 , for 37 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 9 , for 36 Xs.
Variable to drop: Max_Temperature_DegC_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 8 , for 35 Xs.
Variable to drop: FTSE_Volume
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 7 , for 34 Xs.
Variable to drop: FTSE_Low_Price
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 7 , for 33 Xs.
Variable to drop: FTSE_Volume_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 7 , for 32 Xs.
Variable to drop: Sun_Hours_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 7 , for 31 Xs.
Variable to drop: Min_Temperature_DegC_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 6 , for 30 Xs.
Variable to drop: Max_Temperature_DegC
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 6 , for 29 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 6 , for 28 Xs.
Variable to drop: Rainfall_mm_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 6 , for 27 Xs.
Variable to drop: cumCasesByPublishDate_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 6 , for 26 Xs.
Variable to drop: cumVirusTests
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 6 , for 25 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 5 , for 24 Xs.
Variable to drop: FTSE_High_Price_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 4 , for 23 Xs.
Variable to drop: FTSE_High_Price
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 3 , for 22 Xs.
Variable to drop: FTSE_High_Price_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 3 , for 21 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 3 , for 20 Xs.
Variable to drop: cumCasesByPublishDate_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 1 , for 19 Xs.
Variable to drop: FTSE_Close_Price
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 1 , for 18 Xs.
Variable to drop: cumVirusTests_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 1 , for 17 Xs.
Variable to drop: Unnamed__0_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 1 , for 16 Xs.
Variable to drop: cumCasesByPublishDate
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 1 , for 15 Xs.
Variable to drop: FTSE_Low_Price_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 14 Xs.
Variable to drop: Unnamed__0
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 13 Xs.
Variable to drop: cumAdmissions_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 12 Xs.
Variable to drop: cumAdmissions
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 11 Xs.
Variable to drop: Unnamed__0_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 10 Xs.
Variable to drop: cumAdmissions_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 9 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 8 Xs.
Variable to drop: US_Covid_Cases
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 7 Xs.
Variable to drop: US_Covid_Deaths_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 6 Xs.
Variable to drop: US_Covid_Deaths
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 5 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 4 Xs.
Variable to drop: US_Covid_Deaths_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 3 Xs.
Variable to drop: FTSE_Close_Price_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 2 Xs.
Variable to drop: FTSE_Low_Price_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 1 Xs.
Variable left: SPY_Open_Price
Restarting from best model (with 14 Xs & Adjusted R² = -inf) found so far...
Adding 15 2-way interactions among 6 untransformed variables in best model found so far:
SPY_Open_Price_x_US_Covid_Deaths
SPY_Open_Price_x_US_Covid_Cases
SPY_Open_Price_x_Unnamed__0
SPY_Open_Price_x_cumAdmissions
SPY_Open_Price_x_cumDailyNsoDeathsByDeathDate
US_Covid_Deaths_x_US_Covid_Cases
US_Covid_Deaths_x_Unnamed__0
US_Covid_Deaths_x_cumAdmissions
US_Covid_Deaths_x_cumDailyNsoDeathsByDeathDate
US_Covid_Cases_x_Unnamed__0
US_Covid_Cases_x_cumAdmissions
US_Covid_Cases_x_cumDailyNsoDeathsByDeathDate
Unnamed__0_x_cumAdmissions
Unnamed__0_x_cumDailyNsoDeathsByDeathDate
cumAdmissions_x_cumDailyNsoDeathsByDeathDate
X pairs with correlations > 0.995 :
US_Covid_Cases_x_cumAdmissions , cumAdmissions_x_cumDailyNsoDeathsByDeathDate
US_Covid_Cases_x_cumDailyNsoDeathsByDeathDate , US_Covid_Deaths_x_US_Covid_Cases
US_Covid_Deaths_x_US_Covid_Cases , US_Covid_Deaths_x_cumAdmissions
US_Covid_Deaths_x_cumDailyNsoDeathsByDeathDate , US_Covid_Deaths_sqar
SPY_Open_Price_x_US_Covid_Deaths , US_Covid_Deaths
5 variables considered for deletion:
cumAdmissions_x_cumDailyNsoDeathsByDeathDate
US_Covid_Cases_x_cumDailyNsoDeathsByDeathDate
US_Covid_Deaths_x_US_Covid_Cases
US_Covid_Deaths_x_cumDailyNsoDeathsByDeathDate
SPY_Open_Price_x_US_Covid_Deaths
X pairs with correlations > 0.995 :
(no more)
5 interaction variables deleted.
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 3 , for 24 Xs.
Variable to drop: US_Covid_Cases
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 3 , for 23 Xs.
Variable to drop: cumAdmissions
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 3 , for 22 Xs.
Variable to drop: Unnamed__0_x_cumAdmissions
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 3 , for 21 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 3 , for 20 Xs.
Variable to drop: Unnamed__0
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 3 , for 19 Xs.
Variable to drop: SPY_Open_Price_x_Unnamed__0
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 3 , for 18 Xs.
Variable to drop: US_Covid_Cases_x_cumAdmissions
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 1 , for 17 Xs.
Variable to drop: US_Covid_Cases_x_Unnamed__0
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 16 Xs.
Variable to drop: Unnamed__0_x_cumDailyNsoDeathsByDeathDate
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 15 Xs.
Variable to drop: cumAdmissions_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 14 Xs.
Variable to drop: US_Covid_Deaths_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 13 Xs.
Variable to drop: US_Covid_Deaths_x_Unnamed__0
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 12 Xs.
Variable to drop: Unnamed__0_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 11 Xs.
Variable to drop: cumAdmissions_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 10 Xs.
Variable to drop: SPY_Open_Price_x_cumDailyNsoDeathsByDeathDate
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 9 Xs.
Variable to drop: SPY_Open_Price_x_cumAdmissions
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 8 Xs.
Variable to drop: SPY_Open_Price_x_US_Covid_Cases
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 7 Xs.
Variable to drop: US_Covid_Deaths_x_cumAdmissions
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 6 Xs.
Variable to drop: US_Covid_Deaths
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 5 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 4 Xs.
Variable to drop: US_Covid_Deaths_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 3 Xs.
Variable to drop: FTSE_Close_Price_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 2 Xs.
Variable to drop: FTSE_Low_Price_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 1 Xs.
Variable left: SPY_Open_Price
Best model has 16 Xs (Adjusted R² = -inf , rank deficiency = 0):
Results: Generalized linear model
===============================================================================================================================================================
Model: GLM AIC: nan
Link Function: logit BIC: 9781673.5602
Dependent Variable: SPY_Close_Price Log-Likelihood: nan
Date: 2021-04-22 13:47 LL-Null: nan
No. Observations: 351 Deviance: 9.7836e+06
Df Model: 16 Pearson chi2: 1.78e+23
Df Residuals: 334 Scale: 1.0000
Method: IRLS
---------------------------------------------------------------------------------------------------------------------------------------------------------------
Coef. Std.Err. z P>|z| [0.025 0.975]
---------------------------------------------------------------------------------------------------------------------------------------------------------------
const -119927381986130432.0000 262444815.7646 -456962282.2868 0.0000 -119927382500512816.0000 -119927381471748048.0000
SPY_Open_Price 5561618035927426.0000 860419.2110 6463846883.7903 0.0000 5561618034241035.0000 5561618037613817.0000
US_Covid_Deaths -7524341834170.2598 4238.5143 -1775230976.0147 0.0000 -7524341842477.5947 -7524341825862.9248
SPY_Open_Price_x_US_Covid_Cases 56126533.6484 0.0358 1565817562.2612 0.0000 56126533.5782 56126533.7187
US_Covid_Deaths_x_Unnamed__0 7955067486.6382 9.3299 852640120.4891 0.0000 7955067468.3519 7955067504.9245
SPY_Open_Price_x_cumAdmissions -16529099318.2303 11.0559 -1495044859.2861 0.0000 -16529099339.8996 -16529099296.5611
SPY_Open_Price_x_cumDailyNsoDeathsByDeathDate 40773507958.6309 26.0418 1565697001.3373 0.0000 40773507907.5899 40773508009.6718
Unnamed__0_x_cumDailyNsoDeathsByDeathDate -14341552106.2374 25.5306 -561739808.8526 0.0000 -14341552156.2765 -14341552056.1984
US_Covid_Deaths_sqrt 5092919363121624.0000 2729721.2759 1865728713.0760 0.0000 5092919357771469.0000 5092919368471779.0000
US_Covid_Deaths_sqar -14006791.9983 0.0185 -758971980.3394 0.0000 -14006792.0345 -14006791.9621
US_Covid_Deaths_x_cumAdmissions 29724174.0657 0.0375 791946798.4171 0.0000 29724173.9921 29724174.1392
cumAdmissions_sqrt 2378754336492902.0000 1499961.7029 1585876713.9982 0.0000 2378754333553031.0000 2378754339432773.0000
cumAdmissions_sqar -11064882.0097 0.0181 -610213056.2664 0.0000 -11064882.0453 -11064881.9742
Unnamed__0_sqrt -17276949013461120.0000 13306203.0504 -1298413149.7143 0.0000 -17276949039540798.0000 -17276948987381442.0000
cumDailyNsoDeathsByDeathDate_sqrt -7129710764829622.0000 3509918.5162 -2031303784.3795 0.0000 -7129710771708936.0000 -7129710757950308.0000
FTSE_Low_Price_sqrt -22314716146290464.0000 11249600.2160 -1983600814.0562 0.0000 -22314716168339276.0000 -22314716124241652.0000
FTSE_Close_Price_sqrt 28121773357352944.0000 11224769.0981 2505332012.7575 0.0000 28121773335352800.0000 28121773379353088.0000
===============================================================================================================================================================
Descending order of 16 X's significance, assuming Logit error distribution:
Coefficient z-stat
const -1.199274e+17 -4.569623e+08
SPY_Open_Price 5.561618e+15 6.463847e+09
FTSE_Close_Price_sqrt 2.812177e+16 2.505332e+09
cumDailyNsoDeathsByDeathDate_sqrt -7.129711e+15 -2.031304e+09
FTSE_Low_Price_sqrt -2.231472e+16 -1.983601e+09
US_Covid_Deaths_sqrt 5.092919e+15 1.865729e+09
US_Covid_Deaths -7.524342e+12 -1.775231e+09
cumAdmissions_sqrt 2.378754e+15 1.585877e+09
SPY_Open_Price_x_US_Covid_Cases 5.612653e+07 1.565818e+09
SPY_Open_Price_x_cumDailyNsoDeathsByDeathDate 4.077351e+10 1.565697e+09
SPY_Open_Price_x_cumAdmissions -1.652910e+10 -1.495045e+09
Unnamed__0_sqrt -1.727695e+16 -1.298413e+09
US_Covid_Deaths_x_Unnamed__0 7.955067e+09 8.526401e+08
US_Covid_Deaths_x_cumAdmissions 2.972417e+07 7.919468e+08
US_Covid_Deaths_sqar -1.400679e+07 -7.589720e+08
cumAdmissions_sqar -1.106488e+07 -6.102131e+08
Unnamed__0_x_cumDailyNsoDeathsByDeathDate -1.434155e+10 -5.617398e+08
Rank deficiency = 0: Df Model (16) is same as number of Xs (16).
Partial Leverage (or Partial Regression, or Added-Variable) diagnostic plots for fit:
Logit fit using reproducible random 80% (x_train & y_train) of data rows:
Mean Absolute Residual = 332.78726633903125
Root Mean Squared Residual = 335.19134870903906
R² = nan
Logit prediction using remaining 20% (x_test & y_test) of data rows:
Mean Absolute Error = 336.8010238522727
Root Mean Squared Error = 338.73646660574445
R² = nan
Plots of train-set fit & test-set predict:
========================================================== Next GLM Model ==============================================================
counter = 4
GLM Model Fitted = Probit
1 entered.
Assuming Probit error distribution.
X pairs with correlations > 0.995 :
(no more)
X pairs with correlations > 0.995 :
US_Covid_Cases_sqar , cumAdmissions_sqar
SPY_Open_Price_sqrt , SPY_Open_Price
2 variables considered for deletion:
US_Covid_Cases_sqar
SPY_Open_Price_sqrt
X pairs with correlations > 0.995 :
SPY_Open_Price_sqar , SPY_Open_Price
1 variables considered for deletion:
SPY_Open_Price_sqar
X pairs with correlations > 0.995 :
(no more)
3 transformed variables deleted.
Fit using reproducible random 80% (x_train & y_train) of data rows:
OLS fit including only 18 untransformed Xs:
Results: Ordinary least squares
==========================================================================================
Model: OLS Adj. R-squared: 0.994
Dependent Variable: SPY_Close_Price AIC: 1835.0027
Date: 2021-04-22 13:47 BIC: 1908.3576
No. Observations: 351 Log-Likelihood: -898.50
Df Model: 18 F-statistic: 3006.
Df Residuals: 332 Prob (F-statistic): 0.00
R-squared: 0.994 Scale: 10.355
------------------------------------------------------------------------------------------
Coef. Std.Err. t P>|t| [0.025 0.975]
------------------------------------------------------------------------------------------
Intercept -13.3843 8.5592 -1.5637 0.1188 -30.2213 3.4528
SPY_Open_Price 0.9383 0.0297 31.5925 0.0000 0.8799 0.9967
US_Covid_Deaths -0.0000 0.0000 -1.1948 0.2330 -0.0001 0.0000
US_Covid_Cases 0.0000 0.0000 1.8949 0.0590 -0.0000 0.0000
Unnamed__0 0.0294 0.0167 1.7603 0.0793 -0.0035 0.0622
cumVirusTests 0.0000 0.0000 0.6737 0.5010 -0.0000 0.0000
cumAdmissions -0.0001 0.0001 -1.1298 0.2594 -0.0003 0.0001
cumDailyNsoDeathsByDeathDate 0.0002 0.0002 1.2122 0.2263 -0.0002 0.0006
cumCasesByPublishDate -0.0000 0.0000 -0.1598 0.8731 -0.0000 0.0000
FTSE_Low_Price -0.0235 0.0048 -4.9218 0.0000 -0.0328 -0.0141
FTSE_Close_Price 0.0275 0.0042 6.6060 0.0000 0.0193 0.0357
FTSE_Open_Price 0.0054 0.0032 1.7088 0.0884 -0.0008 0.0117
FTSE_High_Price -0.0072 0.0049 -1.4791 0.1401 -0.0169 0.0024
cumPeopleVaccinatedCompleteByPublishDate 0.0000 0.0000 0.3621 0.7175 -0.0000 0.0000
FTSE_Volume -0.0000 0.0000 -2.4540 0.0146 -0.0000 -0.0000
Sun_Hours 0.0054 0.0264 0.2051 0.8377 -0.0465 0.0573
Rainfall_mm 0.0727 0.0381 1.9087 0.0572 -0.0022 0.1476
Max_Temperature_DegC 1.3880 0.9208 1.5073 0.1327 -0.4234 3.1994
Min_Temperature_DegC -1.4217 0.9891 -1.4374 0.1516 -3.3674 0.5240
------------------------------------------------------------------------------------------
Omnibus: 18.360 Durbin-Watson: 2.007
Prob(Omnibus): 0.000 Jarque-Bera (JB): 30.718
Skew: -0.340 Prob(JB): 0.000
Kurtosis: 4.280 Condition No.: 50134875179
==========================================================================================
* The condition number is large (5e+10). This might indicate strong
multicollinearity or other numerical problems.
Descending order of 18 X's significance, assuming Normal error distribution:
SPY_Open_Price
FTSE_Close_Price
FTSE_Low_Price
FTSE_Volume
Rainfall_mm
US_Covid_Cases
Unnamed__0
FTSE_Open_Price
Max_Temperature_DegC
FTSE_High_Price
Min_Temperature_DegC
cumDailyNsoDeathsByDeathDate
US_Covid_Deaths
cumAdmissions
cumVirusTests
cumPeopleVaccinatedCompleteByPublishDate
Sun_Hours
cumCasesByPublishDate
Rank deficiency = 0: Df Model (18) is same as number of Xs (18).
Probit fit including transformed Xs:
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 38 , for 51 Xs.
Variable to drop: FTSE_Close_Price_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 38 , for 50 Xs.
Variable to drop: FTSE_Volume
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 38 , for 49 Xs.
Variable to drop: FTSE_Low_Price_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 38 , for 48 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 37 , for 47 Xs.
Variable to drop: US_Covid_Deaths_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 37 , for 46 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 37 , for 45 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 36 , for 44 Xs.
Variable to drop: Sun_Hours_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 35 , for 43 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 35 , for 42 Xs.
Variable to drop: US_Covid_Deaths
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 34 , for 41 Xs.
Variable to drop: US_Covid_Cases
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 33 , for 40 Xs.
Variable to drop: US_Covid_Deaths_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 32 , for 39 Xs.
Variable to drop: US_Covid_Cases_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 31 , for 38 Xs.
Variable to drop: FTSE_Open_Price_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 31 , for 37 Xs.
Variable to drop: cumVirusTests_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 30 , for 36 Xs.
Variable to drop: cumAdmissions_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 30 , for 35 Xs.
Variable to drop: cumCasesByPublishDate_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 30 , for 34 Xs.
Variable to drop: SPY_Open_Price
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 29 , for 33 Xs.
Variable to drop: FTSE_Volume_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 24 , for 32 Xs.
Variable to drop: Min_Temperature_DegC_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 23 , for 31 Xs.
Variable to drop: Min_Temperature_DegC
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 22 , for 30 Xs.
Variable to drop: Min_Temperature_DegC_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 21 , for 29 Xs.
Variable to drop: Rainfall_mm_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 20 , for 28 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 19 , for 27 Xs.
Variable to drop: Rainfall_mm
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 18 , for 26 Xs.
Variable to drop: cumAdmissions_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 17 , for 25 Xs.
Variable to drop: Sun_Hours
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 16 , for 24 Xs.
Variable to drop: Rainfall_mm_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 16 , for 23 Xs.
Variable to drop: FTSE_Low_Price
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 15 , for 22 Xs.
Variable to drop: FTSE_Open_Price
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 15 , for 21 Xs.
Variable to drop: Unnamed__0_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 15 , for 20 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 14 , for 19 Xs.
Variable to drop: FTSE_Close_Price_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 13 , for 18 Xs.
Variable to drop: FTSE_Low_Price_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 12 , for 17 Xs.
Variable to drop: FTSE_High_Price_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 11 , for 16 Xs.
Variable to drop: FTSE_Open_Price_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 10 , for 15 Xs.
Variable to drop: Unnamed__0
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 9 , for 14 Xs.
Variable to drop: Unnamed__0_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 8 , for 13 Xs.
Variable to drop: Max_Temperature_DegC_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 7 , for 12 Xs.
Variable to drop: FTSE_Close_Price
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 6 , for 11 Xs.
Variable to drop: cumCasesByPublishDate_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 5 , for 10 Xs.
Variable to drop: FTSE_High_Price_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 4 , for 9 Xs.
Variable to drop: FTSE_Volume_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 4 , for 8 Xs.
Variable to drop: cumAdmissions
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 4 , for 7 Xs.
Variable to drop: cumCasesByPublishDate
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 4 , for 6 Xs.
Variable to drop: cumVirusTests_sqar
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 5 Xs.
Variable to drop: Max_Temperature_DegC
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 4 Xs.
Variable to drop: Sun_Hours_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 3 Xs.
Variable to drop: Max_Temperature_DegC_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 2 Xs.
Variable to drop: FTSE_High_Price
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 1 Xs.
Variable left: cumVirusTests
Restarting from best model (with 5 Xs & Adjusted R² = -inf) found so far...
Adding 3 2-way interactions among 3 untransformed variables in best model found so far:
cumVirusTests_x_FTSE_High_Price
cumVirusTests_x_Max_Temperature_DegC
FTSE_High_Price_x_Max_Temperature_DegC
X pairs with correlations > 0.995 :
cumVirusTests_x_Max_Temperature_DegC , cumVirusTests
1 variables considered for deletion:
cumVirusTests_x_Max_Temperature_DegC
X pairs with correlations > 0.995 :
(no more)
1 interaction variables deleted.
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 2 , for 7 Xs.
Variable to drop: FTSE_High_Price
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 1 , for 6 Xs.
Variable to drop: Max_Temperature_DegC_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 5 Xs.
Variable to drop: Sun_Hours_sqrt
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 4 Xs.
Variable to drop: Max_Temperature_DegC
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 3 Xs.
Variable to drop: cumVirusTests_x_FTSE_High_Price
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 2 Xs.
Variable to drop: FTSE_High_Price_x_Max_Temperature_DegC
Adjusted R² = -inf , max(X p-value) = 0.0 , rank deficiency = 0 , for 1 Xs.
Variable left: cumVirusTests
Best model has 5 Xs (Adjusted R² = -inf , rank deficiency = 0):
Results: Generalized linear model
====================================================================================================================================================
Model: GLM AIC: nan
Link Function: probit BIC: 9781609.0916
Dependent Variable: SPY_Close_Price Log-Likelihood: nan
Date: 2021-04-22 13:47 LL-Null: nan
No. Observations: 351 Deviance: 9.7836e+06
Df Model: 5 Pearson chi2: 1.78e+23
Df Residuals: 345 Scale: 1.0000
Method: IRLS
----------------------------------------------------------------------------------------------------------------------------------------------------
Coef. Std.Err. z P>|z| [0.025 0.975]
----------------------------------------------------------------------------------------------------------------------------------------------------
const 224817196444027008.0000 7490562.3975 30013393456.0552 0.0000 224817196429345760.0000 224817196458708256.0000
cumVirusTests 10698254939.5785 0.4272 25042097170.6195 0.0000 10698254938.7412 10698254940.4159
cumVirusTests_x_FTSE_High_Price -1486858.5600 0.0001 -23339289589.8988 0.0000 -1486858.5601 -1486858.5598
FTSE_High_Price_x_Max_Temperature_DegC 2169188191465.9165 72.4998 29919916083.2899 0.0000 2169188191323.8196 2169188191608.0134
Sun_Hours_sqrt -1705676287014696.0000 1006477.9119 -1694698181.5146 0.0000 -1705676288987356.5000 -1705676285042035.5000
Max_Temperature_DegC -10910885028242008.0000 892966.8686 -12218689641.8882 0.0000 -10910885029992190.0000 -10910885026491826.0000
====================================================================================================================================================
Descending order of 5 X's significance, assuming Probit error distribution:
Coefficient z-stat
const 2.248172e+17 3.001339e+10
FTSE_High_Price_x_Max_Temperature_DegC 2.169188e+12 2.991992e+10
cumVirusTests 1.069825e+10 2.504210e+10
cumVirusTests_x_FTSE_High_Price -1.486859e+06 -2.333929e+10
Max_Temperature_DegC -1.091089e+16 -1.221869e+10
Sun_Hours_sqrt -1.705676e+15 -1.694698e+09
Rank deficiency = 0: Df Model (5) is same as number of Xs (5).
Partial Leverage (or Partial Regression, or Added-Variable) diagnostic plots for fit:
Probit fit using reproducible random 80% (x_train & y_train) of data rows:
Mean Absolute Residual = 332.78726633903125
Root Mean Squared Residual = 335.19134870903906
R² = nan
Probit prediction using remaining 20% (x_test & y_test) of data rows:
Mean Absolute Error = 336.8010238522727
Root Mean Squared Error = 338.73646660574445
R² = nan
Plots of train-set fit & test-set predict:
========================================================== Next GLM Model ==============================================================
counter = 5
GLM Model Fitted = Gamma
1 entered.
Assuming Gamma error distribution.
X pairs with correlations > 0.995 :
(no more)
X pairs with correlations > 0.995 :
US_Covid_Cases_sqar , cumAdmissions_sqar
SPY_Open_Price_sqrt , SPY_Open_Price
2 variables considered for deletion:
US_Covid_Cases_sqar
SPY_Open_Price_sqrt
X pairs with correlations > 0.995 :
SPY_Open_Price_sqar , SPY_Open_Price
1 variables considered for deletion:
SPY_Open_Price_sqar
X pairs with correlations > 0.995 :
(no more)
3 transformed variables deleted.
Fit using reproducible random 80% (x_train & y_train) of data rows:
OLS fit including only 18 untransformed Xs:
Results: Ordinary least squares
==========================================================================================
Model: OLS Adj. R-squared: 0.994
Dependent Variable: SPY_Close_Price AIC: 1835.0027
Date: 2021-04-22 13:47 BIC: 1908.3576
No. Observations: 351 Log-Likelihood: -898.50
Df Model: 18 F-statistic: 3006.
Df Residuals: 332 Prob (F-statistic): 0.00
R-squared: 0.994 Scale: 10.355
------------------------------------------------------------------------------------------
Coef. Std.Err. t P>|t| [0.025 0.975]
------------------------------------------------------------------------------------------
Intercept -13.3843 8.5592 -1.5637 0.1188 -30.2213 3.4528
SPY_Open_Price 0.9383 0.0297 31.5925 0.0000 0.8799 0.9967
US_Covid_Deaths -0.0000 0.0000 -1.1948 0.2330 -0.0001 0.0000
US_Covid_Cases 0.0000 0.0000 1.8949 0.0590 -0.0000 0.0000
Unnamed__0 0.0294 0.0167 1.7603 0.0793 -0.0035 0.0622
cumVirusTests 0.0000 0.0000 0.6737 0.5010 -0.0000 0.0000
cumAdmissions -0.0001 0.0001 -1.1298 0.2594 -0.0003 0.0001
cumDailyNsoDeathsByDeathDate 0.0002 0.0002 1.2122 0.2263 -0.0002 0.0006
cumCasesByPublishDate -0.0000 0.0000 -0.1598 0.8731 -0.0000 0.0000
FTSE_Low_Price -0.0235 0.0048 -4.9218 0.0000 -0.0328 -0.0141
FTSE_Close_Price 0.0275 0.0042 6.6060 0.0000 0.0193 0.0357
FTSE_Open_Price 0.0054 0.0032 1.7088 0.0884 -0.0008 0.0117
FTSE_High_Price -0.0072 0.0049 -1.4791 0.1401 -0.0169 0.0024
cumPeopleVaccinatedCompleteByPublishDate 0.0000 0.0000 0.3621 0.7175 -0.0000 0.0000
FTSE_Volume -0.0000 0.0000 -2.4540 0.0146 -0.0000 -0.0000
Sun_Hours 0.0054 0.0264 0.2051 0.8377 -0.0465 0.0573
Rainfall_mm 0.0727 0.0381 1.9087 0.0572 -0.0022 0.1476
Max_Temperature_DegC 1.3880 0.9208 1.5073 0.1327 -0.4234 3.1994
Min_Temperature_DegC -1.4217 0.9891 -1.4374 0.1516 -3.3674 0.5240
------------------------------------------------------------------------------------------
Omnibus: 18.360 Durbin-Watson: 2.007
Prob(Omnibus): 0.000 Jarque-Bera (JB): 30.718
Skew: -0.340 Prob(JB): 0.000
Kurtosis: 4.280 Condition No.: 50134875179
==========================================================================================
* The condition number is large (5e+10). This might indicate strong
multicollinearity or other numerical problems.
Descending order of 18 X's significance, assuming Normal error distribution:
SPY_Open_Price
FTSE_Close_Price
FTSE_Low_Price
FTSE_Volume
Rainfall_mm
US_Covid_Cases
Unnamed__0
FTSE_Open_Price
Max_Temperature_DegC
FTSE_High_Price
Min_Temperature_DegC
cumDailyNsoDeathsByDeathDate
US_Covid_Deaths
cumAdmissions
cumVirusTests
cumPeopleVaccinatedCompleteByPublishDate
Sun_Hours
cumCasesByPublishDate
Rank deficiency = 0: Df Model (18) is same as number of Xs (18).
Gamma fit including transformed Xs:
Adjusted R² = -0.038095089645062696 , max(X p-value) = 0.41526594353651547 , rank deficiency = 38 , for 51 Xs.
Variable to drop: cumCasesByPublishDate_sqrt
Adjusted R² = -0.029914719010490032 , max(X p-value) = 0.9914294112282587 , rank deficiency = 37 , for 50 Xs.
Variable to drop: Rainfall_mm_sqar
Adjusted R² = 0.6738995883251777 , max(X p-value) = 0.008123221671038001 , rank deficiency = 36 , for 49 Xs.
Variable to drop: Sun_Hours_sqar
Adjusted R² = 0.6737233578612362 , max(X p-value) = 2.2415787382879906e-05 , rank deficiency = 35 , for 48 Xs.
Variable to drop: FTSE_Close_Price_sqar
Adjusted R² = 0.6774595964630861 , max(X p-value) = 7.1979579619872e-05 , rank deficiency = 35 , for 47 Xs.
Variable to drop: FTSE_Volume_sqar
Adjusted R² = 0.9745466372106943 , max(X p-value) = 0.9914837739439523 , rank deficiency = 28 , for 46 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqar
Adjusted R² = 0.9748051682566703 , max(X p-value) = 0.804182873304361 , rank deficiency = 28 , for 45 Xs.
Variable to drop: FTSE_Volume_sqrt
Adjusted R² = 0.9768560490882827 , max(X p-value) = 0.7671972644138509 , rank deficiency = 28 , for 44 Xs.
Variable to drop: US_Covid_Cases_sqrt
Adjusted R² = 0.9812330389683493 , max(X p-value) = 0.960121040753244 , rank deficiency = 27 , for 43 Xs.
Variable to drop: US_Covid_Deaths
Adjusted R² = 0.9815562718080507 , max(X p-value) = 0.9922203604814641 , rank deficiency = 26 , for 42 Xs.
Variable to drop: Rainfall_mm
Adjusted R² = 0.9817654117355576 , max(X p-value) = 0.947380073520984 , rank deficiency = 25 , for 41 Xs.
Variable to drop: US_Covid_Deaths_sqar
Adjusted R² = 0.9815193235301343 , max(X p-value) = 0.9319924646332649 , rank deficiency = 25 , for 40 Xs.
Variable to drop: cumCasesByPublishDate
Adjusted R² = 0.9822425366146693 , max(X p-value) = 0.8319838644864784 , rank deficiency = 25 , for 39 Xs.
Variable to drop: US_Covid_Cases
Adjusted R² = 0.981499235701325 , max(X p-value) = 0.8797386820116424 , rank deficiency = 25 , for 38 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate
Adjusted R² = 0.9819872514065774 , max(X p-value) = 0.6091973237550996 , rank deficiency = 25 , for 37 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqrt
Adjusted R² = 0.9818545176982307 , max(X p-value) = 0.9691297790238252 , rank deficiency = 24 , for 36 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqrt
Adjusted R² = 0.9805211845177628 , max(X p-value) = 0.8733833689009399 , rank deficiency = 23 , for 35 Xs.
Variable to drop: FTSE_High_Price
Adjusted R² = 0.980759296774929 , max(X p-value) = 0.5576208045712543 , rank deficiency = 22 , for 34 Xs.
Variable to drop: cumVirusTests_sqrt
Adjusted R² = 0.9795648178884052 , max(X p-value) = 0.987526897899163 , rank deficiency = 21 , for 33 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqar
Adjusted R² = 0.9823413753129854 , max(X p-value) = 0.8881091673850936 , rank deficiency = 21 , for 32 Xs.
Variable to drop: Min_Temperature_DegC_sqrt
Adjusted R² = 0.9825432341083821 , max(X p-value) = 0.7156261695071593 , rank deficiency = 20 , for 31 Xs.
Variable to drop: Sun_Hours_sqrt
Adjusted R² = 0.9826490960420656 , max(X p-value) = 0.667987535890656 , rank deficiency = 19 , for 30 Xs.
Variable to drop: Rainfall_mm_sqrt
Adjusted R² = 0.9822770283673431 , max(X p-value) = 0.30201549649544457 , rank deficiency = 18 , for 29 Xs.
Variable to drop: Sun_Hours
Adjusted R² = 0.9808098226475256 , max(X p-value) = 0.31663913633790863 , rank deficiency = 17 , for 28 Xs.
Variable to drop: FTSE_Low_Price
Adjusted R² = 0.9807059617680441 , max(X p-value) = 0.10575555372038138 , rank deficiency = 17 , for 27 Xs.
Variable to drop: cumAdmissions_sqrt
Adjusted R² = 0.9803087092740599 , max(X p-value) = 0.05203334963436798 , rank deficiency = 16 , for 26 Xs.
Variable to drop: FTSE_Close_Price
Adjusted R² = 0.9809908674538582 , max(X p-value) = 0.01850669614315132 , rank deficiency = 15 , for 25 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate
Adjusted R² = 0.9810075260525963 , max(X p-value) = 0.30600034082255256 , rank deficiency = 15 , for 24 Xs.
Variable to drop: cumAdmissions
Adjusted R² = 0.9822890953445761 , max(X p-value) = 0.7868907950700005 , rank deficiency = 15 , for 23 Xs.
Variable to drop: FTSE_High_Price_sqrt
Adjusted R² = 0.9825428346450511 , max(X p-value) = 0.6814075557028385 , rank deficiency = 14 , for 22 Xs.
Variable to drop: Max_Temperature_DegC
Adjusted R² = 0.9825949615572719 , max(X p-value) = 0.5833594126205673 , rank deficiency = 13 , for 21 Xs.
Variable to drop: Max_Temperature_DegC_sqrt
Adjusted R² = 0.9824625791209525 , max(X p-value) = 0.48566261637845154 , rank deficiency = 12 , for 20 Xs.
Variable to drop: FTSE_Low_Price_sqrt
Adjusted R² = 0.982391970876151 , max(X p-value) = 0.042379051730815756 , rank deficiency = 11 , for 19 Xs.
Variable to drop: Max_Temperature_DegC_sqar
Adjusted R² = 0.9822789695440493 , max(X p-value) = 0.9007812460034228 , rank deficiency = 10 , for 18 Xs.
Variable to drop: Min_Temperature_DegC_sqar
Adjusted R² = 0.9824142972037274 , max(X p-value) = 0.02694950132703842 , rank deficiency = 9 , for 17 Xs.
Variable to drop: cumVirusTests_sqar
Adjusted R² = 0.989914124202607 , max(X p-value) = 0.8082425453401831 , rank deficiency = 4 , for 16 Xs.
Variable to drop: Min_Temperature_DegC
Adjusted R² = 0.9899413451772844 , max(X p-value) = 0.7509237179885084 , rank deficiency = 4 , for 15 Xs.
Variable to drop: Unnamed__0
Adjusted R² = 0.9899858976851106 , max(X p-value) = 0.6699624041686318 , rank deficiency = 4 , for 14 Xs.
Variable to drop: cumCasesByPublishDate_sqar
Adjusted R² = 0.9899077610712339 , max(X p-value) = 0.1884923866724768 , rank deficiency = 1 , for 13 Xs.
Variable to drop: FTSE_High_Price_sqar
Adjusted R² = 0.9898563138438617 , max(X p-value) = 0.0365776207618209 , rank deficiency = 1 , for 12 Xs.
Variable to drop: Unnamed__0_sqrt
Adjusted R² = 0.9897979517583695 , max(X p-value) = 5.001389210151439e-05 , rank deficiency = 1 , for 11 Xs.
Variable to drop: FTSE_Volume
Adjusted R² = 0.990512010050569 , max(X p-value) = 0.005714340342114575 , rank deficiency = 1 , for 10 Xs.
Variable to drop: cumVirusTests
Adjusted R² = 0.9902965093954104 , max(X p-value) = 0.026096274702054948 , rank deficiency = 1 , for 9 Xs.
Variable to drop: Unnamed__0_sqar
Adjusted R² = 0.9903021034119892 , max(X p-value) = 0.02438586005050493 , rank deficiency = 1 , for 8 Xs.
Variable to drop: US_Covid_Deaths_sqrt
Adjusted R² = 0.9901159328661556 , max(X p-value) = 1.4865062530219122e-05 , rank deficiency = 1 , for 7 Xs.
Variable to drop: FTSE_Low_Price_sqar
Adjusted R² = 0.9897872350317932 , max(X p-value) = 1.8217303122801292e-08 , rank deficiency = 1 , for 6 Xs.
Variable to drop: FTSE_Close_Price_sqrt
Adjusted R² = 0.9891966780259537 , max(X p-value) = 1.025023749565301e-10 , rank deficiency = 1 , for 5 Xs.
Variable to drop: FTSE_Open_Price_sqar
Adjusted R² = 0.98739470912747 , max(X p-value) = 1.8407472572169204e-11 , rank deficiency = 1 , for 4 Xs.
Variable to drop: FTSE_Open_Price
Adjusted R² = 0.9863027039016103 , max(X p-value) = 0.18168186487153715 , rank deficiency = 0 , for 3 Xs.
Variable to drop: FTSE_Open_Price_sqrt
Adjusted R² = 0.9862944297439622 , max(X p-value) = 4.226466261235338e-52 , rank deficiency = 0 , for 2 Xs.
Variable to drop: cumAdmissions_sqar
Adjusted R² = 0.9766380723794102 , max(X p-value) = 0.0 , rank deficiency = 0 , for 1 Xs.
Variable left: SPY_Open_Price
Restarting from best model (with 3 Xs & Adjusted R² = 0.9863027039016103) found so far...
Adding 0 2-way interactions among 1 untransformed variables in best model found so far:
X pairs with correlations > 0.995 :
(no more)
Adjusted R² = 0.9863027039016103 , max(X p-value) = 0.18168186487153715 , rank deficiency = 0 , for 3 Xs.
Variable to drop: FTSE_Open_Price_sqrt
Adjusted R² = 0.9862944297439622 , max(X p-value) = 4.226466261235338e-52 , rank deficiency = 0 , for 2 Xs.
Variable to drop: cumAdmissions_sqar
Adjusted R² = 0.9766380723794102 , max(X p-value) = 0.0 , rank deficiency = 0 , for 1 Xs.
Variable left: SPY_Open_Price
Best model has 3 Xs (Adjusted R² = 0.9863027039016103 , rank deficiency = 0):
Results: Generalized linear model
=====================================================================
Model: GLM AIC: 2164.9639
Link Function: inverse_power BIC: -2033.6055
Dependent Variable: SPY_Close_Price Log-Likelihood: -1078.5
Date: 2021-04-22 13:47 LL-Null: -11641.
No. Observations: 351 Deviance: 0.087348
Df Model: 3 Pearson chi2: 0.0857
Df Residuals: 347 Scale: 0.00024699
Method: IRLS
---------------------------------------------------------------------
Coef. Std.Err. z P>|z| [0.025 0.975]
---------------------------------------------------------------------
const 0.0067 0.0001 95.5663 0.0000 0.0065 0.0068
SPY_Open_Price -0.0000 0.0000 -85.3642 0.0000 -0.0000 -0.0000
cumAdmissions_sqar 0.0000 0.0000 14.8407 0.0000 0.0000 0.0000
FTSE_Open_Price_sqrt -0.0000 0.0000 -1.3356 0.1817 -0.0000 0.0000
=====================================================================
Descending order of 3 X's significance, assuming Gamma error distribution:
Coefficient z-stat
const 6.658417e-03 95.566302
SPY_Open_Price -1.065223e-05 -85.364178
cumAdmissions_sqar 8.676885e-16 14.840660
FTSE_Open_Price_sqrt -1.320934e-06 -1.335594
Rank deficiency = 0: Df Model (3) is same as number of Xs (3).
Partial Leverage (or Partial Regression, or Added-Variable) diagnostic plots for fit:
Gamma fit using reproducible random 80% (x_train & y_train) of data rows:
Mean Absolute Residual = 3.5313688027259356
Root Mean Squared Residual = 4.676660400588652
R² = 0.9864201092967394
Gamma prediction using remaining 20% (x_test & y_test) of data rows:
Mean Absolute Error = 4.0059024059427735
Root Mean Squared Error = 5.233511402040004
R² = 0.9801846278903824
Plots of train-set fit & test-set predict:
========================================================== Next GLM Model ==============================================================
counter = 6
GLM Model Fitted = InverseGaussian
1 entered.
Assuming InverseGaussian error distribution.
X pairs with correlations > 0.995 :
(no more)
X pairs with correlations > 0.995 :
US_Covid_Cases_sqar , cumAdmissions_sqar
SPY_Open_Price_sqrt , SPY_Open_Price
2 variables considered for deletion:
US_Covid_Cases_sqar
SPY_Open_Price_sqrt
X pairs with correlations > 0.995 :
SPY_Open_Price_sqar , SPY_Open_Price
1 variables considered for deletion:
SPY_Open_Price_sqar
X pairs with correlations > 0.995 :
(no more)
3 transformed variables deleted.
Fit using reproducible random 80% (x_train & y_train) of data rows:
OLS fit including only 18 untransformed Xs:
Results: Ordinary least squares
==========================================================================================
Model: OLS Adj. R-squared: 0.994
Dependent Variable: SPY_Close_Price AIC: 1835.0027
Date: 2021-04-22 13:47 BIC: 1908.3576
No. Observations: 351 Log-Likelihood: -898.50
Df Model: 18 F-statistic: 3006.
Df Residuals: 332 Prob (F-statistic): 0.00
R-squared: 0.994 Scale: 10.355
------------------------------------------------------------------------------------------
Coef. Std.Err. t P>|t| [0.025 0.975]
------------------------------------------------------------------------------------------
Intercept -13.3843 8.5592 -1.5637 0.1188 -30.2213 3.4528
SPY_Open_Price 0.9383 0.0297 31.5925 0.0000 0.8799 0.9967
US_Covid_Deaths -0.0000 0.0000 -1.1948 0.2330 -0.0001 0.0000
US_Covid_Cases 0.0000 0.0000 1.8949 0.0590 -0.0000 0.0000
Unnamed__0 0.0294 0.0167 1.7603 0.0793 -0.0035 0.0622
cumVirusTests 0.0000 0.0000 0.6737 0.5010 -0.0000 0.0000
cumAdmissions -0.0001 0.0001 -1.1298 0.2594 -0.0003 0.0001
cumDailyNsoDeathsByDeathDate 0.0002 0.0002 1.2122 0.2263 -0.0002 0.0006
cumCasesByPublishDate -0.0000 0.0000 -0.1598 0.8731 -0.0000 0.0000
FTSE_Low_Price -0.0235 0.0048 -4.9218 0.0000 -0.0328 -0.0141
FTSE_Close_Price 0.0275 0.0042 6.6060 0.0000 0.0193 0.0357
FTSE_Open_Price 0.0054 0.0032 1.7088 0.0884 -0.0008 0.0117
FTSE_High_Price -0.0072 0.0049 -1.4791 0.1401 -0.0169 0.0024
cumPeopleVaccinatedCompleteByPublishDate 0.0000 0.0000 0.3621 0.7175 -0.0000 0.0000
FTSE_Volume -0.0000 0.0000 -2.4540 0.0146 -0.0000 -0.0000
Sun_Hours 0.0054 0.0264 0.2051 0.8377 -0.0465 0.0573
Rainfall_mm 0.0727 0.0381 1.9087 0.0572 -0.0022 0.1476
Max_Temperature_DegC 1.3880 0.9208 1.5073 0.1327 -0.4234 3.1994
Min_Temperature_DegC -1.4217 0.9891 -1.4374 0.1516 -3.3674 0.5240
------------------------------------------------------------------------------------------
Omnibus: 18.360 Durbin-Watson: 2.007
Prob(Omnibus): 0.000 Jarque-Bera (JB): 30.718
Skew: -0.340 Prob(JB): 0.000
Kurtosis: 4.280 Condition No.: 50134875179
==========================================================================================
* The condition number is large (5e+10). This might indicate strong
multicollinearity or other numerical problems.
Descending order of 18 X's significance, assuming Normal error distribution:
SPY_Open_Price
FTSE_Close_Price
FTSE_Low_Price
FTSE_Volume
Rainfall_mm
US_Covid_Cases
Unnamed__0
FTSE_Open_Price
Max_Temperature_DegC
FTSE_High_Price
Min_Temperature_DegC
cumDailyNsoDeathsByDeathDate
US_Covid_Deaths
cumAdmissions
cumVirusTests
cumPeopleVaccinatedCompleteByPublishDate
Sun_Hours
cumCasesByPublishDate
Rank deficiency = 0: Df Model (18) is same as number of Xs (18).
InverseGaussian fit including transformed Xs:
Adjusted R² = -inf , max(X p-value) = 0.4239271818639885 , rank deficiency = 36 , for 51 Xs.
Variable to drop: FTSE_Close_Price_sqar
Adjusted R² = 0.8239852061463475 , max(X p-value) = 0.0015982574774210834 , rank deficiency = 38 , for 50 Xs.
Variable to drop: FTSE_Open_Price_sqar
Adjusted R² = 0.8243443108689608 , max(X p-value) = 0.006619063192685558 , rank deficiency = 38 , for 49 Xs.
Variable to drop: Sun_Hours_sqar
Adjusted R² = 0.8326225169012338 , max(X p-value) = 0.8760619551305862 , rank deficiency = 37 , for 48 Xs.
Variable to drop: Sun_Hours
Adjusted R² = 0.8329486334674152 , max(X p-value) = 0.044934694232213905 , rank deficiency = 36 , for 47 Xs.
Variable to drop: Max_Temperature_DegC_sqar
Adjusted R² = 0.8339439449207757 , max(X p-value) = 0.00042144202111776553 , rank deficiency = 35 , for 46 Xs.
Variable to drop: cumVirusTests_sqrt
Adjusted R² = 0.8327478946691927 , max(X p-value) = 4.269122276011347e-08 , rank deficiency = 34 , for 45 Xs.
Variable to drop: FTSE_Volume_sqar
Adjusted R² = -inf , max(X p-value) = 0.9355671781405178 , rank deficiency = 24 , for 44 Xs.
Variable to drop: US_Covid_Deaths_sqrt
Adjusted R² = -inf , max(X p-value) = 0.8682456144982447 , rank deficiency = 23 , for 43 Xs.
Variable to drop: Min_Temperature_DegC_sqar
Adjusted R² = -inf , max(X p-value) = 0.8453453901970247 , rank deficiency = 22 , for 42 Xs.
Variable to drop: Unnamed__0_sqar
Adjusted R² = -inf , max(X p-value) = 0.8458246513018349 , rank deficiency = 22 , for 41 Xs.
Variable to drop: SPY_Open_Price
Adjusted R² = -inf , max(X p-value) = 0.8680403818913306 , rank deficiency = 21 , for 40 Xs.
Variable to drop: FTSE_Low_Price_sqrt
Adjusted R² = -inf , max(X p-value) = 0.7001197607089971 , rank deficiency = 20 , for 39 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate
Adjusted R² = 0.9646871971689904 , max(X p-value) = 0.9842375042952367 , rank deficiency = 22 , for 38 Xs.
Variable to drop: cumAdmissions_sqar
Adjusted R² = -inf , max(X p-value) = 0.9078467017633014 , rank deficiency = 20 , for 37 Xs.
Variable to drop: Rainfall_mm_sqar
Adjusted R² = -inf , max(X p-value) = 0.9356511371952976 , rank deficiency = 20 , for 36 Xs.
Variable to drop: FTSE_Volume
Adjusted R² = -inf , max(X p-value) = 0.7281189103093106 , rank deficiency = 20 , for 35 Xs.
Variable to drop: cumCasesByPublishDate
Adjusted R² = -inf , max(X p-value) = 0.9227319750237871 , rank deficiency = 19 , for 34 Xs.
Variable to drop: FTSE_Volume_sqrt
Adjusted R² = 0.9634810245125205 , max(X p-value) = 0.7080333152024623 , rank deficiency = 21 , for 33 Xs.
Variable to drop: cumAdmissions_sqrt
Adjusted R² = 0.9634292687668389 , max(X p-value) = 0.6460002768371073 , rank deficiency = 20 , for 32 Xs.
Variable to drop: cumCasesByPublishDate_sqar
Adjusted R² = 0.9635200362935271 , max(X p-value) = 0.9169017699812878 , rank deficiency = 20 , for 31 Xs.
Variable to drop: US_Covid_Cases_sqrt
Adjusted R² = 0.9636471878814946 , max(X p-value) = 0.5096050075596466 , rank deficiency = 19 , for 30 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqrt
Adjusted R² = 0.9635834092803823 , max(X p-value) = 0.3704545927605809 , rank deficiency = 18 , for 29 Xs.
Variable to drop: Min_Temperature_DegC
Adjusted R² = 0.9635839617666314 , max(X p-value) = 0.35625393939720684 , rank deficiency = 17 , for 28 Xs.
Variable to drop: Unnamed__0
Adjusted R² = 0.963558785947793 , max(X p-value) = 0.3317796601357905 , rank deficiency = 16 , for 27 Xs.
Variable to drop: Min_Temperature_DegC_sqrt
Adjusted R² = 0.9635582013762255 , max(X p-value) = 0.3164164927009907 , rank deficiency = 15 , for 26 Xs.
Variable to drop: Rainfall_mm
Adjusted R² = 0.9633811636896545 , max(X p-value) = 0.2461218192960295 , rank deficiency = 14 , for 25 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate
Adjusted R² = 0.9637639305806475 , max(X p-value) = 0.18276031615064936 , rank deficiency = 14 , for 24 Xs.
Variable to drop: FTSE_Open_Price
Adjusted R² = 0.9637596057315688 , max(X p-value) = 0.015666054083568647 , rank deficiency = 13 , for 23 Xs.
Variable to drop: Max_Temperature_DegC
Adjusted R² = 0.9637598558690957 , max(X p-value) = 0.00965337666659057 , rank deficiency = 12 , for 22 Xs.
Variable to drop: FTSE_Close_Price
Adjusted R² = 0.9632839547247531 , max(X p-value) = 0.01565071185329206 , rank deficiency = 11 , for 21 Xs.
Variable to drop: US_Covid_Deaths
Adjusted R² = 0.9616711380923312 , max(X p-value) = 0.4617674097985578 , rank deficiency = 11 , for 20 Xs.
Variable to drop: Rainfall_mm_sqrt
Adjusted R² = 0.9616710026139755 , max(X p-value) = 0.30037564577409637 , rank deficiency = 10 , for 19 Xs.
Variable to drop: Max_Temperature_DegC_sqrt
Adjusted R² = 0.9616710012993169 , max(X p-value) = 0.030509818821458885 , rank deficiency = 9 , for 18 Xs.
Variable to drop: US_Covid_Deaths_sqar
Adjusted R² = 0.9603497165253897 , max(X p-value) = 0.0007492799010965026 , rank deficiency = 9 , for 17 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqrt
Adjusted R² = 0.9589567876113377 , max(X p-value) = 7.221440365979855e-06 , rank deficiency = 8 , for 16 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqar
Adjusted R² = 0.9547224065419012 , max(X p-value) = 1.2652032282139396e-18 , rank deficiency = 8 , for 15 Xs.
Variable to drop: FTSE_High_Price_sqar
Adjusted R² = 0.9537766323056778 , max(X p-value) = 0.1257208897081157 , rank deficiency = 8 , for 14 Xs.
Variable to drop: FTSE_High_Price
Adjusted R² = 0.9539936761565706 , max(X p-value) = 1.5272580106098165e-30 , rank deficiency = 7 , for 13 Xs.
Variable to drop: US_Covid_Cases
Adjusted R² = 0.9337899456330223 , max(X p-value) = 1.7666958844998874e-79 , rank deficiency = 7 , for 12 Xs.
Variable to drop: cumVirusTests_sqar
Adjusted R² = 0.9273258439174662 , max(X p-value) = 0.8454140564387266 , rank deficiency = 0 , for 11 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqar
Adjusted R² = 0.9277894245564721 , max(X p-value) = 0.6155844492362426 , rank deficiency = 0 , for 10 Xs.
Variable to drop: FTSE_Low_Price
Adjusted R² = 0.9281120305194024 , max(X p-value) = 0.3573930504772832 , rank deficiency = 0 , for 9 Xs.
Variable to drop: FTSE_Open_Price_sqrt
Adjusted R² = 0.9280467964767873 , max(X p-value) = 0.2709444170641685 , rank deficiency = 0 , for 8 Xs.
Variable to drop: Sun_Hours_sqrt
Adjusted R² = 0.9277785936533284 , max(X p-value) = 0.25850923601917364 , rank deficiency = 0 , for 7 Xs.
Variable to drop: FTSE_High_Price_sqrt
Adjusted R² = 0.9276073540723047 , max(X p-value) = 0.21554713073242382 , rank deficiency = 0 , for 6 Xs.
Variable to drop: FTSE_Close_Price_sqrt
Adjusted R² = 0.9279420737886087 , max(X p-value) = 0.09615665116879488 , rank deficiency = 0 , for 5 Xs.
Variable to drop: cumVirusTests
Adjusted R² = 0.9271310234394854 , max(X p-value) = 0.04393214512186664 , rank deficiency = 0 , for 4 Xs.
Variable to drop: cumAdmissions
Adjusted R² = 0.9261227222738488 , max(X p-value) = 2.760936371180455e-38 , rank deficiency = 0 , for 3 Xs.
Variable to drop: cumCasesByPublishDate_sqrt
Adjusted R² = 0.8976408037723401 , max(X p-value) = 3.219358527416735e-213 , rank deficiency = 0 , for 2 Xs.
Variable to drop: FTSE_Low_Price_sqar
Adjusted R² = 0.6864465463083862 , max(X p-value) = 5.0423572435754175e-102 , rank deficiency = 0 , for 1 Xs.
Variable left: Unnamed__0_sqrt
Restarting from best model (with 9 Xs & Adjusted R² = 0.9281120305194024) found so far...
Adding 1 2-way interactions among 2 untransformed variables in best model found so far:
cumVirusTests_x_cumAdmissions
X pairs with correlations > 0.995 :
(no more)
Adjusted R² = 0.9572042037148962 , max(X p-value) = 0.8024653334103444 , rank deficiency = 5 , for 10 Xs.
Variable to drop: FTSE_Close_Price_sqrt
Adjusted R² = 0.9571757149365402 , max(X p-value) = 0.6302172157987631 , rank deficiency = 4 , for 9 Xs.
Variable to drop: Sun_Hours_sqrt
Adjusted R² = 0.9571234312696717 , max(X p-value) = 0.24884984527764897 , rank deficiency = 3 , for 8 Xs.
Variable to drop: Unnamed__0_sqrt
Adjusted R² = 0.9567549327444309 , max(X p-value) = 0.023991837333989945 , rank deficiency = 2 , for 7 Xs.
Variable to drop: FTSE_Open_Price_sqrt
Adjusted R² = 0.9564058378450286 , max(X p-value) = 6.02123345989745e-126 , rank deficiency = 1 , for 6 Xs.
Variable to drop: cumCasesByPublishDate_sqrt
Adjusted R² = 0.8842816062833381 , max(X p-value) = 5.432274437679757e-16 , rank deficiency = 1 , for 5 Xs.
Variable to drop: cumAdmissions
Adjusted R² = 0.8694151456760502 , max(X p-value) = 1.8950309637453057e-86 , rank deficiency = 1 , for 4 Xs.
Variable to drop: cumVirusTests_x_cumAdmissions
Adjusted R² = 0.7117103519586268 , max(X p-value) = 2.988620956161278e-05 , rank deficiency = 0 , for 3 Xs.
Variable to drop: FTSE_High_Price_sqrt
Adjusted R² = 0.7038822829652455 , max(X p-value) = 6.195024535527522e-31 , rank deficiency = 0 , for 2 Xs.
Variable to drop: FTSE_Low_Price_sqar
Adjusted R² = 0.6139130384743142 , max(X p-value) = 7.874197071923149e-133 , rank deficiency = 0 , for 1 Xs.
Variable left: cumVirusTests
Best model has 9 Xs (Adjusted R² = 0.9281120305194024 , rank deficiency = 0):
Results: Generalized linear model
===========================================================================
Model: GLM AIC: 2719.6756
Link Function: inverse_squared BIC: -1998.5269
Dependent Variable: SPY_Close_Price Log-Likelihood: -1349.8
Date: 2021-04-22 13:47 LL-Null: -3430.0
No. Observations: 351 Deviance: 0.0012368
Df Model: 9 Pearson chi2: 0.00125
Df Residuals: 341 Scale: 3.6543e-06
Method: IRLS
---------------------------------------------------------------------------
Coef. Std.Err. z P>|z| [0.025 0.975]
---------------------------------------------------------------------------
const 0.0000 0.0000 4.3019 0.0000 0.0000 0.0000
cumCasesByPublishDate_sqrt 0.0000 0.0000 6.9646 0.0000 0.0000 0.0000
cumVirusTests 0.0000 0.0000 1.2567 0.2089 -0.0000 0.0000
cumAdmissions -0.0000 0.0000 -2.2390 0.0252 -0.0000 -0.0000
Unnamed__0_sqrt -0.0000 0.0000 -23.8989 0.0000 -0.0000 -0.0000
FTSE_Close_Price_sqrt -0.0000 0.0000 -1.9295 0.0537 -0.0000 0.0000
FTSE_Low_Price_sqar -0.0000 0.0000 -3.5858 0.0003 -0.0000 -0.0000
FTSE_Open_Price_sqrt -0.0000 0.0000 -0.9203 0.3574 -0.0000 0.0000
FTSE_High_Price_sqrt 0.0000 0.0000 1.5269 0.1268 -0.0000 0.0000
Sun_Hours_sqrt 0.0000 0.0000 1.0999 0.2714 -0.0000 0.0000
===========================================================================
Descending order of 9 X's significance, assuming InverseGaussian error distribution:
Coefficient z-stat
const 3.315614e-05 4.301903
Unnamed__0_sqrt -5.988687e-07 -23.898859
cumCasesByPublishDate_sqrt 2.670837e-09 6.964581
FTSE_Low_Price_sqar -2.174803e-13 -3.585801
cumAdmissions -4.147723e-12 -2.239006
FTSE_Close_Price_sqrt -2.424056e-07 -1.929481
FTSE_High_Price_sqrt 2.207546e-07 1.526854
cumVirusTests 4.844868e-15 1.256698
Sun_Hours_sqrt 5.332357e-08 1.099890
FTSE_Open_Price_sqrt -9.679896e-08 -0.920344
Rank deficiency = 0: Df Model (9) is same as number of Xs (9).
Partial Leverage (or Partial Regression, or Added-Variable) diagnostic plots for fit:
InverseGaussian fit using reproducible random 80% (x_train & y_train) of data rows:
Mean Absolute Residual = 8.169597701299095
Root Mean Squared Residual = 10.631463358061604
R² = 0.9299605783060463
InverseGaussian prediction using remaining 20% (x_test & y_test) of data rows:
Mean Absolute Error = 9.412586038800576
Root Mean Squared Error = 12.100419968372714
R² = 0.8969264877456589
Plots of train-set fit & test-set predict:
========================================================== Next GLM Model ==============================================================
counter = 7
GLM Model Fitted = NegativeBinomial
1 entered.
Assuming NegativeBinomial error distribution.
X pairs with correlations > 0.995 :
(no more)
X pairs with correlations > 0.995 :
US_Covid_Cases_sqar , cumAdmissions_sqar
SPY_Open_Price_sqrt , SPY_Open_Price
2 variables considered for deletion:
US_Covid_Cases_sqar
SPY_Open_Price_sqrt
X pairs with correlations > 0.995 :
SPY_Open_Price_sqar , SPY_Open_Price
1 variables considered for deletion:
SPY_Open_Price_sqar
X pairs with correlations > 0.995 :
(no more)
3 transformed variables deleted.
Fit using reproducible random 80% (x_train & y_train) of data rows:
OLS fit including only 18 untransformed Xs:
Results: Ordinary least squares
==========================================================================================
Model: OLS Adj. R-squared: 0.994
Dependent Variable: SPY_Close_Price AIC: 1835.0027
Date: 2021-04-22 13:48 BIC: 1908.3576
No. Observations: 351 Log-Likelihood: -898.50
Df Model: 18 F-statistic: 3006.
Df Residuals: 332 Prob (F-statistic): 0.00
R-squared: 0.994 Scale: 10.355
------------------------------------------------------------------------------------------
Coef. Std.Err. t P>|t| [0.025 0.975]
------------------------------------------------------------------------------------------
Intercept -13.3843 8.5592 -1.5637 0.1188 -30.2213 3.4528
SPY_Open_Price 0.9383 0.0297 31.5925 0.0000 0.8799 0.9967
US_Covid_Deaths -0.0000 0.0000 -1.1948 0.2330 -0.0001 0.0000
US_Covid_Cases 0.0000 0.0000 1.8949 0.0590 -0.0000 0.0000
Unnamed__0 0.0294 0.0167 1.7603 0.0793 -0.0035 0.0622
cumVirusTests 0.0000 0.0000 0.6737 0.5010 -0.0000 0.0000
cumAdmissions -0.0001 0.0001 -1.1298 0.2594 -0.0003 0.0001
cumDailyNsoDeathsByDeathDate 0.0002 0.0002 1.2122 0.2263 -0.0002 0.0006
cumCasesByPublishDate -0.0000 0.0000 -0.1598 0.8731 -0.0000 0.0000
FTSE_Low_Price -0.0235 0.0048 -4.9218 0.0000 -0.0328 -0.0141
FTSE_Close_Price 0.0275 0.0042 6.6060 0.0000 0.0193 0.0357
FTSE_Open_Price 0.0054 0.0032 1.7088 0.0884 -0.0008 0.0117
FTSE_High_Price -0.0072 0.0049 -1.4791 0.1401 -0.0169 0.0024
cumPeopleVaccinatedCompleteByPublishDate 0.0000 0.0000 0.3621 0.7175 -0.0000 0.0000
FTSE_Volume -0.0000 0.0000 -2.4540 0.0146 -0.0000 -0.0000
Sun_Hours 0.0054 0.0264 0.2051 0.8377 -0.0465 0.0573
Rainfall_mm 0.0727 0.0381 1.9087 0.0572 -0.0022 0.1476
Max_Temperature_DegC 1.3880 0.9208 1.5073 0.1327 -0.4234 3.1994
Min_Temperature_DegC -1.4217 0.9891 -1.4374 0.1516 -3.3674 0.5240
------------------------------------------------------------------------------------------
Omnibus: 18.360 Durbin-Watson: 2.007
Prob(Omnibus): 0.000 Jarque-Bera (JB): 30.718
Skew: -0.340 Prob(JB): 0.000
Kurtosis: 4.280 Condition No.: 50134875179
==========================================================================================
* The condition number is large (5e+10). This might indicate strong
multicollinearity or other numerical problems.
Descending order of 18 X's significance, assuming Normal error distribution:
SPY_Open_Price
FTSE_Close_Price
FTSE_Low_Price
FTSE_Volume
Rainfall_mm
US_Covid_Cases
Unnamed__0
FTSE_Open_Price
Max_Temperature_DegC
FTSE_High_Price
Min_Temperature_DegC
cumDailyNsoDeathsByDeathDate
US_Covid_Deaths
cumAdmissions
cumVirusTests
cumPeopleVaccinatedCompleteByPublishDate
Sun_Hours
cumCasesByPublishDate
Rank deficiency = 0: Df Model (18) is same as number of Xs (18).
NegativeBinomial fit including transformed Xs:
Adjusted R² = -inf , max(X p-value) = 0.4239271818639885 , rank deficiency = 36 , for 51 Xs.
Variable to drop: FTSE_Close_Price_sqar
Adjusted R² = 0.25851673598495795 , max(X p-value) = 0.8110684493247655 , rank deficiency = 38 , for 50 Xs.
Variable to drop: FTSE_Volume_sqar
Adjusted R² = 0.9784826173832215 , max(X p-value) = 0.9940991759101296 , rank deficiency = 29 , for 49 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqar
Adjusted R² = 0.9811446478829835 , max(X p-value) = 0.9979818698233138 , rank deficiency = 29 , for 48 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqrt
Adjusted R² = -inf , max(X p-value) = 0.9178917320677753 , rank deficiency = 27 , for 47 Xs.
Variable to drop: Rainfall_mm
Adjusted R² = -inf , max(X p-value) = 0.869446506965223 , rank deficiency = 25 , for 46 Xs.
Variable to drop: Rainfall_mm_sqar
Adjusted R² = 0.9796107194387808 , max(X p-value) = 0.9997509695918703 , rank deficiency = 27 , for 45 Xs.
Variable to drop: cumVirusTests
Adjusted R² = 0.9509546841572301 , max(X p-value) = 0.9998630612289565 , rank deficiency = 27 , for 44 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqrt
Adjusted R² = 0.9595534548358368 , max(X p-value) = 0.9995119572147023 , rank deficiency = 26 , for 43 Xs.
Variable to drop: US_Covid_Cases
Adjusted R² = 0.9594856699682751 , max(X p-value) = 0.9992637281738792 , rank deficiency = 26 , for 42 Xs.
Variable to drop: US_Covid_Cases_sqrt
Adjusted R² = -inf , max(X p-value) = 0.9824804696553304 , rank deficiency = 23 , for 41 Xs.
Variable to drop: FTSE_Volume
Adjusted R² = -inf , max(X p-value) = 0.9479835642730126 , rank deficiency = 22 , for 40 Xs.
Variable to drop: Max_Temperature_DegC_sqar
Adjusted R² = -inf , max(X p-value) = 0.8191858803950456 , rank deficiency = 21 , for 39 Xs.
Variable to drop: SPY_Open_Price
Adjusted R² = -0.0431671966720637 , max(X p-value) = 0.9691993999415717 , rank deficiency = 23 , for 38 Xs.
Variable to drop: cumCasesByPublishDate_sqrt
Adjusted R² = -0.04316019137917371 , max(X p-value) = 0.9862121076564154 , rank deficiency = 22 , for 37 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate
Adjusted R² = -0.040054233385155014 , max(X p-value) = 0.984720754938564 , rank deficiency = 22 , for 36 Xs.
Variable to drop: cumCasesByPublishDate_sqar
Adjusted R² = -inf , max(X p-value) = 0.9658192873317433 , rank deficiency = 19 , for 35 Xs.
Variable to drop: FTSE_Volume_sqrt
Adjusted R² = 0.9635800787235222 , max(X p-value) = 0.999966980775433 , rank deficiency = 21 , for 34 Xs.
Variable to drop: Sun_Hours_sqrt
Adjusted R² = 0.9636028528804653 , max(X p-value) = 0.9956579813943889 , rank deficiency = 20 , for 33 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqar
Adjusted R² = 0.9639098844675889 , max(X p-value) = 0.9907734786889844 , rank deficiency = 20 , for 32 Xs.
Variable to drop: Sun_Hours_sqar
Adjusted R² = 0.9639378252655619 , max(X p-value) = 0.9990334860229483 , rank deficiency = 20 , for 31 Xs.
Variable to drop: Sun_Hours
Adjusted R² = 0.9639346883408123 , max(X p-value) = 0.9750923972775037 , rank deficiency = 19 , for 30 Xs.
Variable to drop: cumCasesByPublishDate
Adjusted R² = 0.9640819846501361 , max(X p-value) = 0.9733143840608784 , rank deficiency = 19 , for 29 Xs.
Variable to drop: Unnamed__0_sqrt
Adjusted R² = 0.9640368307827757 , max(X p-value) = 0.9732105237707079 , rank deficiency = 18 , for 28 Xs.
Variable to drop: Min_Temperature_DegC_sqar
Adjusted R² = 0.9638299123053033 , max(X p-value) = 0.962139530459453 , rank deficiency = 17 , for 27 Xs.
Variable to drop: FTSE_Open_Price
Adjusted R² = 0.9627571422518552 , max(X p-value) = 0.9954971061847963 , rank deficiency = 16 , for 26 Xs.
Variable to drop: FTSE_Open_Price_sqar
Adjusted R² = 0.9627957108985886 , max(X p-value) = 0.959984086393447 , rank deficiency = 16 , for 25 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate
Adjusted R² = 0.9628213344972945 , max(X p-value) = 0.9484199321842063 , rank deficiency = 16 , for 24 Xs.
Variable to drop: cumVirusTests_sqar
Adjusted R² = 0.9762523892779772 , max(X p-value) = 0.9979316310424023 , rank deficiency = 4 , for 23 Xs.
Variable to drop: Max_Temperature_DegC_sqrt
Adjusted R² = 0.9762548139279352 , max(X p-value) = 0.9980270782777162 , rank deficiency = 3 , for 22 Xs.
Variable to drop: Max_Temperature_DegC
Adjusted R² = 0.9763265146849678 , max(X p-value) = 0.9951417836322906 , rank deficiency = 3 , for 21 Xs.
Variable to drop: Min_Temperature_DegC_sqrt
Adjusted R² = 0.9763146459382227 , max(X p-value) = 0.9879061899933536 , rank deficiency = 2 , for 20 Xs.
Variable to drop: Rainfall_mm_sqrt
Adjusted R² = 0.9763315425688514 , max(X p-value) = 0.9845371527219251 , rank deficiency = 2 , for 19 Xs.
Variable to drop: cumVirusTests_sqrt
Adjusted R² = 0.9763959758770053 , max(X p-value) = 0.9699730019739401 , rank deficiency = 2 , for 18 Xs.
Variable to drop: US_Covid_Deaths_sqrt
Adjusted R² = 0.9760877424976763 , max(X p-value) = 0.9701908674624365 , rank deficiency = 2 , for 17 Xs.
Variable to drop: FTSE_Low_Price_sqrt
Adjusted R² = 0.9757699230726944 , max(X p-value) = 0.9931395761853153 , rank deficiency = 2 , for 16 Xs.
Variable to drop: FTSE_High_Price_sqrt
Adjusted R² = 0.975827186070108 , max(X p-value) = 0.9788162268157002 , rank deficiency = 1 , for 15 Xs.
Variable to drop: FTSE_High_Price_sqar
Adjusted R² = 0.9758481504250733 , max(X p-value) = 0.9670121423209674 , rank deficiency = 1 , for 14 Xs.
Variable to drop: FTSE_Low_Price
Adjusted R² = 0.9757514736577624 , max(X p-value) = 0.9684389832323271 , rank deficiency = 1 , for 13 Xs.
Variable to drop: Min_Temperature_DegC
Adjusted R² = 0.9755543674409409 , max(X p-value) = 0.9588921423126838 , rank deficiency = 1 , for 12 Xs.
Variable to drop: FTSE_High_Price
Adjusted R² = 0.9755351857664977 , max(X p-value) = 0.9685492806999143 , rank deficiency = 1 , for 11 Xs.
Variable to drop: FTSE_Open_Price_sqrt
Adjusted R² = 0.9752872967710653 , max(X p-value) = 0.9647552463670731 , rank deficiency = 1 , for 10 Xs.
Variable to drop: FTSE_Low_Price_sqar
Adjusted R² = 0.9748501389293323 , max(X p-value) = 0.9616272010893153 , rank deficiency = 1 , for 9 Xs.
Variable to drop: FTSE_Close_Price
Adjusted R² = 0.9746231689951188 , max(X p-value) = 0.9226021263361994 , rank deficiency = 0 , for 8 Xs.
Variable to drop: Unnamed__0
Adjusted R² = 0.973234550814783 , max(X p-value) = 0.933743877637395 , rank deficiency = 0 , for 7 Xs.
Variable to drop: Unnamed__0_sqar
Adjusted R² = 0.9719270232508703 , max(X p-value) = 0.9001923141453096 , rank deficiency = 0 , for 6 Xs.
Variable to drop: cumAdmissions_sqar
Adjusted R² = 0.9677305421548326 , max(X p-value) = 0.959845223696758 , rank deficiency = 0 , for 5 Xs.
Variable to drop: cumAdmissions_sqrt
Adjusted R² = 0.9679884720554267 , max(X p-value) = 0.7492933998310836 , rank deficiency = 0 , for 4 Xs.
Variable to drop: cumAdmissions
Adjusted R² = 0.9467812699416305 , max(X p-value) = 0.5085487392799516 , rank deficiency = 0 , for 3 Xs.
Variable to drop: US_Covid_Deaths_sqar
Adjusted R² = 0.867595086674129 , max(X p-value) = 0.3653865169085685 , rank deficiency = 0 , for 2 Xs.
Variable to drop: FTSE_Close_Price_sqrt
Adjusted R² = 0.7470842063486482 , max(X p-value) = 0.05339316211284266 , rank deficiency = 0 , for 1 Xs.
Variable left: US_Covid_Deaths
Restarting from best model (with 8 Xs & Adjusted R² = 0.9746231689951188) found so far...
Adding 3 2-way interactions among 3 untransformed variables in best model found so far:
US_Covid_Deaths_x_Unnamed__0
US_Covid_Deaths_x_cumAdmissions
Unnamed__0_x_cumAdmissions
X pairs with correlations > 0.995 :
(no more)
Adjusted R² = 0.97558850206319 , max(X p-value) = 0.9999740062865784 , rank deficiency = 0 , for 11 Xs.
Variable to drop: US_Covid_Deaths_x_Unnamed__0
Adjusted R² = 0.9756604895566179 , max(X p-value) = 0.9975799781990787 , rank deficiency = 0 , for 10 Xs.
Variable to drop: Unnamed__0_x_cumAdmissions
Adjusted R² = 0.9757299963056252 , max(X p-value) = 0.9805167793117081 , rank deficiency = 0 , for 9 Xs.
Variable to drop: cumAdmissions_sqar
Adjusted R² = 0.9754334785361856 , max(X p-value) = 0.9025347596066564 , rank deficiency = 0 , for 8 Xs.
Variable to drop: Unnamed__0
Adjusted R² = 0.9728432494389669 , max(X p-value) = 0.9377425110521883 , rank deficiency = 0 , for 7 Xs.
Variable to drop: Unnamed__0_sqar
Adjusted R² = 0.971576896593853 , max(X p-value) = 0.9076132574097945 , rank deficiency = 0 , for 6 Xs.
Variable to drop: US_Covid_Deaths_x_cumAdmissions
Adjusted R² = 0.9677305421548326 , max(X p-value) = 0.959845223696758 , rank deficiency = 0 , for 5 Xs.
Variable to drop: cumAdmissions_sqrt
Adjusted R² = 0.9679884720554267 , max(X p-value) = 0.7492933998310836 , rank deficiency = 0 , for 4 Xs.
Variable to drop: cumAdmissions
Adjusted R² = 0.9467812699416305 , max(X p-value) = 0.5085487392799516 , rank deficiency = 0 , for 3 Xs.
Variable to drop: US_Covid_Deaths_sqar
Adjusted R² = 0.867595086674129 , max(X p-value) = 0.3653865169085685 , rank deficiency = 0 , for 2 Xs.
Variable to drop: FTSE_Close_Price_sqrt
Adjusted R² = 0.7470842063486482 , max(X p-value) = 0.05339316211284266 , rank deficiency = 0 , for 1 Xs.
Variable left: US_Covid_Deaths
Best model has 9 Xs (Adjusted R² = 0.9757299963056252 , rank deficiency = 0):
Results: Generalized linear model
===============================================================================
Model: GLM AIC: 4796.8792
Link Function: log BIC: -1998.3845
Dependent Variable: SPY_Close_Price Log-Likelihood: -2388.4
Date: 2021-04-22 13:48 LL-Null: -2391.0
No. Observations: 351 Deviance: 0.14359
Df Model: 9 Pearson chi2: 0.144
Df Residuals: 341 Scale: 1.0000
Method: IRLS
-------------------------------------------------------------------------------
Coef. Std.Err. z P>|z| [0.025 0.975]
-------------------------------------------------------------------------------
const 4.0767 3.4476 1.1825 0.2370 -2.6804 10.8338
Unnamed__0_sqar 0.0000 0.0001 0.1190 0.9053 -0.0001 0.0001
US_Covid_Deaths 0.0000 0.0000 0.2311 0.8173 -0.0000 0.0000
Unnamed__0 -0.0024 0.0229 -0.1047 0.9166 -0.0472 0.0424
cumAdmissions -0.0000 0.0000 -0.1745 0.8615 -0.0001 0.0001
US_Covid_Deaths_sqar -0.0000 0.0000 -0.0758 0.9395 -0.0000 0.0000
US_Covid_Deaths_x_cumAdmissions 0.0000 0.0000 0.0553 0.9559 -0.0000 0.0000
cumAdmissions_sqrt 0.0023 0.0157 0.1442 0.8854 -0.0286 0.0331
cumAdmissions_sqar -0.0000 0.0000 -0.0244 0.9805 -0.0000 0.0000
FTSE_Close_Price_sqrt 0.0198 0.0390 0.5062 0.6127 -0.0567 0.0962
===============================================================================
Descending order of 9 X's significance, assuming NegativeBinomial error distribution:
Coefficient z-stat
const 4.076692e+00 1.182485
FTSE_Close_Price_sqrt 1.975185e-02 0.506204
US_Covid_Deaths 4.332912e-06 0.231071
cumAdmissions -8.408623e-06 -0.174524
cumAdmissions_sqrt 2.268341e-03 0.144154
Unnamed__0_sqar 6.173242e-06 0.118989
Unnamed__0 -2.395461e-03 -0.104736
US_Covid_Deaths_sqar -1.892812e-11 -0.075849
US_Covid_Deaths_x_cumAdmissions 2.829563e-11 0.055282
cumAdmissions_sqar -6.235014e-12 -0.024421
Rank deficiency = 0: Df Model (9) is same as number of Xs (9).
Partial Leverage (or Partial Regression, or Added-Variable) diagnostic plots for fit:
NegativeBinomial fit using reproducible random 80% (x_train & y_train) of data rows:
Mean Absolute Residual = 4.633597811653204
Root Mean Squared Residual = 6.162924235322751
R² = 0.9763540821149091
NegativeBinomial prediction using remaining 20% (x_test & y_test) of data rows:
Mean Absolute Error = 4.92380417012546
Root Mean Squared Error = 6.339034050842144
R² = 0.9700938531436452
Plots of train-set fit & test-set predict:
========================================================== Next GLM Model ==============================================================
counter = 8
GLM Model Fitted = Tweedie
1 entered.
Assuming Tweedie error distribution.
X pairs with correlations > 0.995 :
(no more)
X pairs with correlations > 0.995 :
US_Covid_Cases_sqar , cumAdmissions_sqar
SPY_Open_Price_sqrt , SPY_Open_Price
2 variables considered for deletion:
US_Covid_Cases_sqar
SPY_Open_Price_sqrt
X pairs with correlations > 0.995 :
SPY_Open_Price_sqar , SPY_Open_Price
1 variables considered for deletion:
SPY_Open_Price_sqar
X pairs with correlations > 0.995 :
(no more)
3 transformed variables deleted.
Fit using reproducible random 80% (x_train & y_train) of data rows:
OLS fit including only 18 untransformed Xs:
Results: Ordinary least squares
==========================================================================================
Model: OLS Adj. R-squared: 0.994
Dependent Variable: SPY_Close_Price AIC: 1835.0027
Date: 2021-04-22 13:48 BIC: 1908.3576
No. Observations: 351 Log-Likelihood: -898.50
Df Model: 18 F-statistic: 3006.
Df Residuals: 332 Prob (F-statistic): 0.00
R-squared: 0.994 Scale: 10.355
------------------------------------------------------------------------------------------
Coef. Std.Err. t P>|t| [0.025 0.975]
------------------------------------------------------------------------------------------
Intercept -13.3843 8.5592 -1.5637 0.1188 -30.2213 3.4528
SPY_Open_Price 0.9383 0.0297 31.5925 0.0000 0.8799 0.9967
US_Covid_Deaths -0.0000 0.0000 -1.1948 0.2330 -0.0001 0.0000
US_Covid_Cases 0.0000 0.0000 1.8949 0.0590 -0.0000 0.0000
Unnamed__0 0.0294 0.0167 1.7603 0.0793 -0.0035 0.0622
cumVirusTests 0.0000 0.0000 0.6737 0.5010 -0.0000 0.0000
cumAdmissions -0.0001 0.0001 -1.1298 0.2594 -0.0003 0.0001
cumDailyNsoDeathsByDeathDate 0.0002 0.0002 1.2122 0.2263 -0.0002 0.0006
cumCasesByPublishDate -0.0000 0.0000 -0.1598 0.8731 -0.0000 0.0000
FTSE_Low_Price -0.0235 0.0048 -4.9218 0.0000 -0.0328 -0.0141
FTSE_Close_Price 0.0275 0.0042 6.6060 0.0000 0.0193 0.0357
FTSE_Open_Price 0.0054 0.0032 1.7088 0.0884 -0.0008 0.0117
FTSE_High_Price -0.0072 0.0049 -1.4791 0.1401 -0.0169 0.0024
cumPeopleVaccinatedCompleteByPublishDate 0.0000 0.0000 0.3621 0.7175 -0.0000 0.0000
FTSE_Volume -0.0000 0.0000 -2.4540 0.0146 -0.0000 -0.0000
Sun_Hours 0.0054 0.0264 0.2051 0.8377 -0.0465 0.0573
Rainfall_mm 0.0727 0.0381 1.9087 0.0572 -0.0022 0.1476
Max_Temperature_DegC 1.3880 0.9208 1.5073 0.1327 -0.4234 3.1994
Min_Temperature_DegC -1.4217 0.9891 -1.4374 0.1516 -3.3674 0.5240
------------------------------------------------------------------------------------------
Omnibus: 18.360 Durbin-Watson: 2.007
Prob(Omnibus): 0.000 Jarque-Bera (JB): 30.718
Skew: -0.340 Prob(JB): 0.000
Kurtosis: 4.280 Condition No.: 50134875179
==========================================================================================
* The condition number is large (5e+10). This might indicate strong
multicollinearity or other numerical problems.
Descending order of 18 X's significance, assuming Normal error distribution:
SPY_Open_Price
FTSE_Close_Price
FTSE_Low_Price
FTSE_Volume
Rainfall_mm
US_Covid_Cases
Unnamed__0
FTSE_Open_Price
Max_Temperature_DegC
FTSE_High_Price
Min_Temperature_DegC
cumDailyNsoDeathsByDeathDate
US_Covid_Deaths
cumAdmissions
cumVirusTests
cumPeopleVaccinatedCompleteByPublishDate
Sun_Hours
cumCasesByPublishDate
Rank deficiency = 0: Df Model (18) is same as number of Xs (18).
Tweedie fit including transformed Xs:
Adjusted R² = 0.26337472323779754 , max(X p-value) = 8.485391324675111e-10 , rank deficiency = 38 , for 51 Xs.
Variable to drop: FTSE_Volume_sqar
Adjusted R² = -0.030816358273417332 , max(X p-value) = 0.9484997386486083 , rank deficiency = 29 , for 50 Xs.
Variable to drop: SPY_Open_Price
Adjusted R² = -0.06257667980959902 , max(X p-value) = 0.9999999933957445 , rank deficiency = 28 , for 49 Xs.
Variable to drop: Unnamed__0_sqar
Adjusted R² = 0.2857659868619724 , max(X p-value) = 0.964311905013361 , rank deficiency = 28 , for 48 Xs.
Variable to drop: Unnamed__0
Adjusted R² = -inf , max(X p-value) = 0.9941271745915241 , rank deficiency = 26 , for 47 Xs.
Variable to drop: Sun_Hours_sqrt
Adjusted R² = -0.05217639790477335 , max(X p-value) = 0.9999999449556624 , rank deficiency = 26 , for 46 Xs.
Variable to drop: Rainfall_mm_sqar
Adjusted R² = -inf , max(X p-value) = 0.9034693326381602 , rank deficiency = 25 , for 45 Xs.
Variable to drop: US_Covid_Deaths_sqrt
Adjusted R² = -0.052050777710036966 , max(X p-value) = 0.9999997762071091 , rank deficiency = 25 , for 44 Xs.
Variable to drop: FTSE_Open_Price_sqar
Adjusted R² = -inf , max(X p-value) = 0.8550122468912682 , rank deficiency = 23 , for 43 Xs.
Variable to drop: US_Covid_Deaths
Adjusted R² = -inf , max(X p-value) = 0.8630651980178177 , rank deficiency = 22 , for 42 Xs.
Variable to drop: Rainfall_mm_sqrt
Adjusted R² = -inf , max(X p-value) = 0.7737468428459299 , rank deficiency = 21 , for 41 Xs.
Variable to drop: Max_Temperature_DegC_sqar
Adjusted R² = -inf , max(X p-value) = 0.9692982850222289 , rank deficiency = 20 , for 40 Xs.
Variable to drop: US_Covid_Cases
Adjusted R² = -0.028000426423533176 , max(X p-value) = 0.9999998088719391 , rank deficiency = 23 , for 39 Xs.
Variable to drop: cumVirusTests
Adjusted R² = -inf , max(X p-value) = 0.8824944686318188 , rank deficiency = 19 , for 38 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqar
Adjusted R² = 0.6727081726907724 , max(X p-value) = 0.9877599367315504 , rank deficiency = 23 , for 37 Xs.
Variable to drop: FTSE_Open_Price
Adjusted R² = -inf , max(X p-value) = 0.9294611391117007 , rank deficiency = 19 , for 36 Xs.
Variable to drop: FTSE_Volume_sqrt
Adjusted R² = -0.04112649063558749 , max(X p-value) = 0.9999875438492379 , rank deficiency = 21 , for 35 Xs.
Variable to drop: FTSE_Low_Price
Adjusted R² = -inf , max(X p-value) = 0.996768103843435 , rank deficiency = 18 , for 34 Xs.
Variable to drop: Min_Temperature_DegC_sqar
Adjusted R² = -inf , max(X p-value) = 0.8801681923655718 , rank deficiency = 17 , for 33 Xs.
Variable to drop: FTSE_Volume
Adjusted R² = 0.9609051109106718 , max(X p-value) = 0.905688615427734 , rank deficiency = 20 , for 32 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqar
Adjusted R² = 0.960732665314362 , max(X p-value) = 0.8567351232316983 , rank deficiency = 20 , for 31 Xs.
Variable to drop: US_Covid_Deaths_sqar
Adjusted R² = 0.9608692135799667 , max(X p-value) = 0.37765521249982215 , rank deficiency = 19 , for 30 Xs.
Variable to drop: cumAdmissions_sqar
Adjusted R² = 0.9610112834439053 , max(X p-value) = 0.08764367946462114 , rank deficiency = 19 , for 29 Xs.
Variable to drop: Sun_Hours_sqar
Adjusted R² = 0.9606131655672178 , max(X p-value) = 0.08736035912062151 , rank deficiency = 19 , for 28 Xs.
Variable to drop: Rainfall_mm
Adjusted R² = 0.9603992598425083 , max(X p-value) = 0.1841867341895489 , rank deficiency = 18 , for 27 Xs.
Variable to drop: Sun_Hours
Adjusted R² = 0.9601788248137411 , max(X p-value) = 0.05766483388156418 , rank deficiency = 17 , for 26 Xs.
Variable to drop: cumVirusTests_sqar
Adjusted R² = 0.979611991207814 , max(X p-value) = 0.9976340301154789 , rank deficiency = 9 , for 25 Xs.
Variable to drop: FTSE_High_Price_sqar
Adjusted R² = 0.9797473308925067 , max(X p-value) = 0.9188655981308715 , rank deficiency = 9 , for 24 Xs.
Variable to drop: Max_Temperature_DegC
Adjusted R² = 0.979746789741994 , max(X p-value) = 0.8420767651177772 , rank deficiency = 8 , for 23 Xs.
Variable to drop: FTSE_Close_Price_sqar
Adjusted R² = 0.9797459276627567 , max(X p-value) = 0.8234145262640392 , rank deficiency = 7 , for 22 Xs.
Variable to drop: Max_Temperature_DegC_sqrt
Adjusted R² = 0.9797397232492825 , max(X p-value) = 0.4367566441080295 , rank deficiency = 6 , for 21 Xs.
Variable to drop: Min_Temperature_DegC_sqrt
Adjusted R² = 0.9796952183485478 , max(X p-value) = 0.36403110713713693 , rank deficiency = 5 , for 20 Xs.
Variable to drop: FTSE_High_Price
Adjusted R² = 0.9796924259748909 , max(X p-value) = 0.3231163902357249 , rank deficiency = 5 , for 19 Xs.
Variable to drop: cumVirusTests_sqrt
Adjusted R² = 0.9796329029192942 , max(X p-value) = 0.018789696284837325 , rank deficiency = 5 , for 18 Xs.
Variable to drop: Min_Temperature_DegC
Adjusted R² = 0.9793629032059366 , max(X p-value) = 0.0007175786885975456 , rank deficiency = 5 , for 17 Xs.
Variable to drop: FTSE_High_Price_sqrt
Adjusted R² = 0.9788688094758122 , max(X p-value) = 0.01834388513559004 , rank deficiency = 4 , for 16 Xs.
Variable to drop: FTSE_Open_Price_sqrt
Adjusted R² = 0.9785684841853032 , max(X p-value) = 0.00021696400543255392 , rank deficiency = 3 , for 15 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate
Adjusted R² = 0.9775976538969147 , max(X p-value) = 0.06656634970808796 , rank deficiency = 3 , for 14 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqrt
Adjusted R² = 0.9773841408569733 , max(X p-value) = 6.631810212878975e-07 , rank deficiency = 3 , for 13 Xs.
Variable to drop: FTSE_Low_Price_sqrt
Adjusted R² = 0.975868174704539 , max(X p-value) = 0.004422192704661826 , rank deficiency = 2 , for 12 Xs.
Variable to drop: FTSE_Low_Price_sqar
Adjusted R² = 0.9752634381120173 , max(X p-value) = 1.3476944716767852e-12 , rank deficiency = 2 , for 11 Xs.
Variable to drop: cumAdmissions_sqrt
Adjusted R² = 0.9719940021533227 , max(X p-value) = 0.028973829836223014 , rank deficiency = 2 , for 10 Xs.
Variable to drop: Unnamed__0_sqrt
Adjusted R² = 0.9716861481720236 , max(X p-value) = 0.0006121640991778728 , rank deficiency = 1 , for 9 Xs.
Variable to drop: cumAdmissions
Adjusted R² = 0.9704731213037017 , max(X p-value) = 6.239182155125863e-05 , rank deficiency = 1 , for 8 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate
Adjusted R² = 0.9694338508262417 , max(X p-value) = 2.698752550201499e-08 , rank deficiency = 1 , for 7 Xs.
Variable to drop: cumCasesByPublishDate_sqrt
Adjusted R² = 0.9665228403913103 , max(X p-value) = 1.1878462801892936e-25 , rank deficiency = 1 , for 6 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqrt
Adjusted R² = 0.9563855396045312 , max(X p-value) = 3.145908267169609e-13 , rank deficiency = 1 , for 5 Xs.
Variable to drop: cumCasesByPublishDate_sqar
Adjusted R² = 0.9552977003165088 , max(X p-value) = 0.09071568629534765 , rank deficiency = 0 , for 4 Xs.
Variable to drop: FTSE_Close_Price_sqrt
Adjusted R² = 0.954978130151698 , max(X p-value) = 1.7745444134069286e-87 , rank deficiency = 0 , for 3 Xs.
Variable to drop: cumCasesByPublishDate
Adjusted R² = 0.9075880765771066 , max(X p-value) = 1.148446730037334e-135 , rank deficiency = 0 , for 2 Xs.
Variable to drop: FTSE_Close_Price
Adjusted R² = 0.76293616701704 , max(X p-value) = 8.4188613655406e-218 , rank deficiency = 0 , for 1 Xs.
Variable left: US_Covid_Cases_sqrt
Restarting from best model (with 4 Xs & Adjusted R² = 0.9552977003165088) found so far...
Adding 1 2-way interactions among 2 untransformed variables in best model found so far:
cumCasesByPublishDate_x_FTSE_Close_Price
X pairs with correlations > 0.995 :
cumCasesByPublishDate_x_FTSE_Close_Price , cumCasesByPublishDate
1 variables considered for deletion:
cumCasesByPublishDate_x_FTSE_Close_Price
X pairs with correlations > 0.995 :
(no more)
1 interaction variables deleted.
Adjusted R² = 0.9552977003165088 , max(X p-value) = 0.09071568629534765 , rank deficiency = 0 , for 4 Xs.
Variable to drop: FTSE_Close_Price_sqrt
Adjusted R² = 0.954978130151698 , max(X p-value) = 1.7745444134069286e-87 , rank deficiency = 0 , for 3 Xs.
Variable to drop: cumCasesByPublishDate
Adjusted R² = 0.9075880765771066 , max(X p-value) = 1.148446730037334e-135 , rank deficiency = 0 , for 2 Xs.
Variable to drop: FTSE_Close_Price
Adjusted R² = 0.76293616701704 , max(X p-value) = 8.4188613655406e-218 , rank deficiency = 0 , for 1 Xs.
Variable left: US_Covid_Cases_sqrt
Best model has 4 Xs (Adjusted R² = 0.9552977003165088 , rank deficiency = 0):
Results: Generalized linear model
======================================================================
Model: GLM AIC: nan
Link Function: log BIC: -1951.4692
Dependent Variable: SPY_Close_Price Log-Likelihood: nan
Date: 2021-04-22 13:48 LL-Null: nan
No. Observations: 351 Deviance: 76.363
Df Model: 4 Pearson chi2: 76.4
Df Residuals: 346 Scale: 0.22090
Method: IRLS
----------------------------------------------------------------------
Coef. Std.Err. z P>|z| [0.025 0.975]
----------------------------------------------------------------------
const 6.3627 0.9340 6.8125 0.0000 4.5321 8.1932
US_Covid_Cases_sqrt 0.0001 0.0000 34.6644 0.0000 0.0001 0.0001
cumCasesByPublishDate -0.0000 0.0000 -18.6737 0.0000 -0.0000 -0.0000
FTSE_Close_Price_sqrt -0.0397 0.0235 -1.6916 0.0907 -0.0858 0.0063
FTSE_Close_Price 0.0004 0.0001 2.6172 0.0089 0.0001 0.0007
======================================================================
Descending order of 4 X's significance, assuming Tweedie error distribution:
Coefficient z-stat
const 6.362686e+00 6.812508
US_Covid_Cases_sqrt 9.261921e-05 34.664380
cumCasesByPublishDate -5.502792e-08 -18.673723
FTSE_Close_Price 3.852609e-04 2.617186
FTSE_Close_Price_sqrt -3.974398e-02 -1.691634
Rank deficiency = 0: Df Model (4) is same as number of Xs (4).
Partial Leverage (or Partial Regression, or Added-Variable) diagnostic plots for fit:
Tweedie fit using reproducible random 80% (x_train & y_train) of data rows:
Mean Absolute Residual = 6.522954856507443
Root Mean Squared Residual = 8.425833338411111
R² = 0.955808583741463
Tweedie prediction using remaining 20% (x_test & y_test) of data rows:
Mean Absolute Error = 6.772154464555934
Root Mean Squared Error = 8.798627262724867
R² = 0.9454408645798134
Plots of train-set fit & test-set predict:
========================================================== Next GLM Model ==============================================================
counter = 9
GLM Model Fitted = GLM Gaussian
1 entered.
Assuming GLM Gaussian error distribution.
X pairs with correlations > 0.995 :
(no more)
X pairs with correlations > 0.995 :
US_Covid_Cases_sqar , cumAdmissions_sqar
SPY_Open_Price_sqrt , SPY_Open_Price
2 variables considered for deletion:
US_Covid_Cases_sqar
SPY_Open_Price_sqrt
X pairs with correlations > 0.995 :
SPY_Open_Price_sqar , SPY_Open_Price
1 variables considered for deletion:
SPY_Open_Price_sqar
X pairs with correlations > 0.995 :
(no more)
3 transformed variables deleted.
Fit using reproducible random 80% (x_train & y_train) of data rows:
OLS fit including only 18 untransformed Xs:
Results: Ordinary least squares
==========================================================================================
Model: OLS Adj. R-squared: 0.994
Dependent Variable: SPY_Close_Price AIC: 1835.0027
Date: 2021-04-22 13:48 BIC: 1908.3576
No. Observations: 351 Log-Likelihood: -898.50
Df Model: 18 F-statistic: 3006.
Df Residuals: 332 Prob (F-statistic): 0.00
R-squared: 0.994 Scale: 10.355
------------------------------------------------------------------------------------------
Coef. Std.Err. t P>|t| [0.025 0.975]
------------------------------------------------------------------------------------------
Intercept -13.3843 8.5592 -1.5637 0.1188 -30.2213 3.4528
SPY_Open_Price 0.9383 0.0297 31.5925 0.0000 0.8799 0.9967
US_Covid_Deaths -0.0000 0.0000 -1.1948 0.2330 -0.0001 0.0000
US_Covid_Cases 0.0000 0.0000 1.8949 0.0590 -0.0000 0.0000
Unnamed__0 0.0294 0.0167 1.7603 0.0793 -0.0035 0.0622
cumVirusTests 0.0000 0.0000 0.6737 0.5010 -0.0000 0.0000
cumAdmissions -0.0001 0.0001 -1.1298 0.2594 -0.0003 0.0001
cumDailyNsoDeathsByDeathDate 0.0002 0.0002 1.2122 0.2263 -0.0002 0.0006
cumCasesByPublishDate -0.0000 0.0000 -0.1598 0.8731 -0.0000 0.0000
FTSE_Low_Price -0.0235 0.0048 -4.9218 0.0000 -0.0328 -0.0141
FTSE_Close_Price 0.0275 0.0042 6.6060 0.0000 0.0193 0.0357
FTSE_Open_Price 0.0054 0.0032 1.7088 0.0884 -0.0008 0.0117
FTSE_High_Price -0.0072 0.0049 -1.4791 0.1401 -0.0169 0.0024
cumPeopleVaccinatedCompleteByPublishDate 0.0000 0.0000 0.3621 0.7175 -0.0000 0.0000
FTSE_Volume -0.0000 0.0000 -2.4540 0.0146 -0.0000 -0.0000
Sun_Hours 0.0054 0.0264 0.2051 0.8377 -0.0465 0.0573
Rainfall_mm 0.0727 0.0381 1.9087 0.0572 -0.0022 0.1476
Max_Temperature_DegC 1.3880 0.9208 1.5073 0.1327 -0.4234 3.1994
Min_Temperature_DegC -1.4217 0.9891 -1.4374 0.1516 -3.3674 0.5240
------------------------------------------------------------------------------------------
Omnibus: 18.360 Durbin-Watson: 2.007
Prob(Omnibus): 0.000 Jarque-Bera (JB): 30.718
Skew: -0.340 Prob(JB): 0.000
Kurtosis: 4.280 Condition No.: 50134875179
==========================================================================================
* The condition number is large (5e+10). This might indicate strong
multicollinearity or other numerical problems.
Descending order of 18 X's significance, assuming Normal error distribution:
SPY_Open_Price
FTSE_Close_Price
FTSE_Low_Price
FTSE_Volume
Rainfall_mm
US_Covid_Cases
Unnamed__0
FTSE_Open_Price
Max_Temperature_DegC
FTSE_High_Price
Min_Temperature_DegC
cumDailyNsoDeathsByDeathDate
US_Covid_Deaths
cumAdmissions
cumVirusTests
cumPeopleVaccinatedCompleteByPublishDate
Sun_Hours
cumCasesByPublishDate
Rank deficiency = 0: Df Model (18) is same as number of Xs (18).
GLM Gaussian fit including transformed Xs:
Adjusted R² = 0.9521687178749622 , max(X p-value) = 0.9991590917301658 , rank deficiency = 38 , for 51 Xs.
Variable to drop: FTSE_Close_Price_sqar
Adjusted R² = 0.9458928632088003 , max(X p-value) = 0.5030354421003307 , rank deficiency = 38 , for 50 Xs.
Variable to drop: FTSE_Volume
Adjusted R² = 0.9491981768157216 , max(X p-value) = 0.6514193998060303 , rank deficiency = 38 , for 49 Xs.
Variable to drop: FTSE_Low_Price_sqar
Adjusted R² = 0.9496790847766674 , max(X p-value) = 0.3450540609297422 , rank deficiency = 38 , for 48 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqar
Adjusted R² = 0.9501355528854897 , max(X p-value) = 0.24504116982231738 , rank deficiency = 37 , for 47 Xs.
Variable to drop: US_Covid_Deaths_sqar
Adjusted R² = 0.9502454538882703 , max(X p-value) = 0.01416628507439223 , rank deficiency = 37 , for 46 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqar
Adjusted R² = 0.9499647791732948 , max(X p-value) = 0.0012082663368496325 , rank deficiency = 37 , for 45 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate
Adjusted R² = 0.9499422178112109 , max(X p-value) = 6.062416449169565e-06 , rank deficiency = 36 , for 44 Xs.
Variable to drop: Sun_Hours_sqar
Adjusted R² = 0.9489327023352171 , max(X p-value) = 0.0008566559792864465 , rank deficiency = 35 , for 43 Xs.
Variable to drop: cumAdmissions_sqrt
Adjusted R² = 0.9489687419683277 , max(X p-value) = 0.00016389929135335205 , rank deficiency = 34 , for 42 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate
Adjusted R² = 0.9476591628537214 , max(X p-value) = 4.180837734068112e-06 , rank deficiency = 34 , for 41 Xs.
Variable to drop: US_Covid_Deaths
Adjusted R² = 0.9466649115823093 , max(X p-value) = 0.004059820732383955 , rank deficiency = 33 , for 40 Xs.
Variable to drop: US_Covid_Cases
Adjusted R² = 0.9452045016596836 , max(X p-value) = 0.3385997201060783 , rank deficiency = 32 , for 39 Xs.
Variable to drop: US_Covid_Deaths_sqrt
Adjusted R² = 0.9452502482365887 , max(X p-value) = 0.0005724312736614727 , rank deficiency = 31 , for 38 Xs.
Variable to drop: US_Covid_Cases_sqrt
Adjusted R² = 0.9451700368653767 , max(X p-value) = 7.59181877069992e-12 , rank deficiency = 30 , for 37 Xs.
Variable to drop: FTSE_Open_Price_sqar
Adjusted R² = 0.9421043630927839 , max(X p-value) = 2.1587988492541244e-42 , rank deficiency = 30 , for 36 Xs.
Variable to drop: cumVirusTests_sqrt
Adjusted R² = 0.9420902859457855 , max(X p-value) = 1.0051884961715618e-43 , rank deficiency = 29 , for 35 Xs.
Variable to drop: cumAdmissions_sqar
Adjusted R² = 0.9304765370100303 , max(X p-value) = 3.128327474244406e-31 , rank deficiency = 29 , for 34 Xs.
Variable to drop: cumCasesByPublishDate_sqar
Adjusted R² = 0.9224815515490871 , max(X p-value) = 4.2669492352620333e-69 , rank deficiency = 29 , for 33 Xs.
Variable to drop: FTSE_Volume_sqar
Adjusted R² = 0.992986789702342 , max(X p-value) = 0.7109152366692485 , rank deficiency = 24 , for 32 Xs.
Variable to drop: FTSE_High_Price
Adjusted R² = 0.993393784342271 , max(X p-value) = 0.8124429477419713 , rank deficiency = 23 , for 31 Xs.
Variable to drop: Rainfall_mm_sqar
Adjusted R² = 0.9933950107805211 , max(X p-value) = 0.7657443421277565 , rank deficiency = 23 , for 30 Xs.
Variable to drop: Max_Temperature_DegC_sqar
Adjusted R² = 0.9934212945964804 , max(X p-value) = 0.9885008415845726 , rank deficiency = 22 , for 29 Xs.
Variable to drop: Min_Temperature_DegC
Adjusted R² = 0.9934259439640759 , max(X p-value) = 0.9738587428583876 , rank deficiency = 21 , for 28 Xs.
Variable to drop: Min_Temperature_DegC_sqar
Adjusted R² = 0.9931740574752974 , max(X p-value) = 0.7248485476689062 , rank deficiency = 20 , for 27 Xs.
Variable to drop: Unnamed__0_sqar
Adjusted R² = 0.9934906618378911 , max(X p-value) = 0.5948709682074692 , rank deficiency = 20 , for 26 Xs.
Variable to drop: FTSE_Open_Price
Adjusted R² = 0.993503219184722 , max(X p-value) = 0.13602404700044038 , rank deficiency = 20 , for 25 Xs.
Variable to drop: FTSE_High_Price_sqar
Adjusted R² = 0.9936054598071008 , max(X p-value) = 0.3840966734214135 , rank deficiency = 19 , for 24 Xs.
Variable to drop: cumCasesByPublishDate_sqrt
Adjusted R² = 0.9933853135182988 , max(X p-value) = 0.6506195064174154 , rank deficiency = 18 , for 23 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqrt
Adjusted R² = 0.993471988485031 , max(X p-value) = 0.6858201917781456 , rank deficiency = 17 , for 22 Xs.
Variable to drop: Sun_Hours
Adjusted R² = 0.9935383716526485 , max(X p-value) = 0.8626571928726365 , rank deficiency = 16 , for 21 Xs.
Variable to drop: Rainfall_mm
Adjusted R² = 0.9935361233436539 , max(X p-value) = 0.5328127893210027 , rank deficiency = 15 , for 20 Xs.
Variable to drop: Unnamed__0_sqrt
Adjusted R² = 0.9935258595305695 , max(X p-value) = 0.32293863270606105 , rank deficiency = 14 , for 19 Xs.
Variable to drop: Unnamed__0
Adjusted R² = 0.9932735619015912 , max(X p-value) = 0.8125738542807013 , rank deficiency = 13 , for 18 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqrt
Adjusted R² = 0.993266507649893 , max(X p-value) = 0.2422451322529977 , rank deficiency = 12 , for 17 Xs.
Variable to drop: cumAdmissions
Adjusted R² = 0.9933673237181243 , max(X p-value) = 0.00022173518518576153 , rank deficiency = 12 , for 16 Xs.
Variable to drop: cumCasesByPublishDate
Adjusted R² = 0.9931351192617246 , max(X p-value) = 0.017808503553657953 , rank deficiency = 12 , for 15 Xs.
Variable to drop: cumVirusTests_sqar
Adjusted R² = 0.9933930706265588 , max(X p-value) = 0.5280589901321804 , rank deficiency = 0 , for 14 Xs.
Variable to drop: Sun_Hours_sqrt
Adjusted R² = 0.9934048704581153 , max(X p-value) = 0.584014354377572 , rank deficiency = 0 , for 13 Xs.
Variable to drop: Max_Temperature_DegC
Adjusted R² = 0.9934185331158425 , max(X p-value) = 0.519942519831301 , rank deficiency = 0 , for 12 Xs.
Variable to drop: Min_Temperature_DegC_sqrt
Adjusted R² = 0.9934299097865403 , max(X p-value) = 0.7238897359216616 , rank deficiency = 0 , for 11 Xs.
Variable to drop: Rainfall_mm_sqrt
Adjusted R² = 0.99344682204506 , max(X p-value) = 0.12493991924784437 , rank deficiency = 0 , for 10 Xs.
Variable to drop: FTSE_Low_Price
Adjusted R² = 0.9934207962140935 , max(X p-value) = 0.13026897053812833 , rank deficiency = 0 , for 9 Xs.
Variable to drop: FTSE_High_Price_sqrt
Adjusted R² = 0.9933959934558393 , max(X p-value) = 0.36837606463035844 , rank deficiency = 0 , for 8 Xs.
Variable to drop: FTSE_Open_Price_sqrt
Adjusted R² = 0.9933996683032318 , max(X p-value) = 0.09491947982747762 , rank deficiency = 0 , for 7 Xs.
Variable to drop: Max_Temperature_DegC_sqrt
Adjusted R² = 0.993365344697354 , max(X p-value) = 0.0703963328453047 , rank deficiency = 0 , for 6 Xs.
Variable to drop: FTSE_Close_Price
Adjusted R² = 0.9933216183820517 , max(X p-value) = 0.10525290484886904 , rank deficiency = 0 , for 5 Xs.
Variable to drop: cumVirusTests
Adjusted R² = 0.9932902707374489 , max(X p-value) = 0.015254146024860263 , rank deficiency = 0 , for 4 Xs.
Variable to drop: FTSE_Volume_sqrt
Adjusted R² = 0.9931957761886894 , max(X p-value) = 3.8933493736165473e-10 , rank deficiency = 0 , for 3 Xs.
Variable to drop: FTSE_Close_Price_sqrt
Adjusted R² = 0.9924495478708685 , max(X p-value) = 0.6344145836600983 , rank deficiency = 0 , for 2 Xs.
Variable to drop: FTSE_Low_Price_sqrt
Adjusted R² = 0.9924662903673915 , max(X p-value) = 0.0 , rank deficiency = 0 , for 1 Xs.
Variable left: SPY_Open_Price
Restarting from best model (with 10 Xs & Adjusted R² = 0.99344682204506) found so far...
Adding 6 2-way interactions among 4 untransformed variables in best model found so far:
SPY_Open_Price_x_cumVirusTests
SPY_Open_Price_x_FTSE_Low_Price
SPY_Open_Price_x_FTSE_Close_Price
cumVirusTests_x_FTSE_Low_Price
cumVirusTests_x_FTSE_Close_Price
FTSE_Low_Price_x_FTSE_Close_Price
X pairs with correlations > 0.995 :
SPY_Open_Price_x_cumVirusTests , cumVirusTests_x_FTSE_Close_Price
SPY_Open_Price_x_FTSE_Low_Price , SPY_Open_Price_x_FTSE_Close_Price
2 variables considered for deletion:
cumVirusTests_x_FTSE_Close_Price
SPY_Open_Price_x_FTSE_Close_Price
X pairs with correlations > 0.995 :
SPY_Open_Price_x_cumVirusTests , cumVirusTests_x_FTSE_Low_Price
1 variables considered for deletion:
SPY_Open_Price_x_cumVirusTests
X pairs with correlations > 0.995 :
(no more)
3 interaction variables deleted.
Adjusted R² = 0.9934416840404366 , max(X p-value) = 0.8013245035516933 , rank deficiency = 2 , for 13 Xs.
Variable to drop: cumVirusTests
Adjusted R² = 0.993459753394557 , max(X p-value) = 0.2941622579525536 , rank deficiency = 2 , for 12 Xs.
Variable to drop: SPY_Open_Price_x_FTSE_Low_Price
Adjusted R² = 0.9934578565750812 , max(X p-value) = 0.1547006786768103 , rank deficiency = 2 , for 11 Xs.
Variable to drop: FTSE_Close_Price
Adjusted R² = 0.9934413558958652 , max(X p-value) = 0.19410321988533386 , rank deficiency = 1 , for 10 Xs.
Variable to drop: FTSE_Low_Price
Adjusted R² = 0.9934281964999098 , max(X p-value) = 0.18199536537601535 , rank deficiency = 1 , for 9 Xs.
Variable to drop: FTSE_Low_Price_x_FTSE_Close_Price
Adjusted R² = 0.993413227839378 , max(X p-value) = 0.14079246867211612 , rank deficiency = 1 , for 8 Xs.
Variable to drop: Max_Temperature_DegC_sqrt
Adjusted R² = 0.993371570319278 , max(X p-value) = 0.10446508688756267 , rank deficiency = 0 , for 7 Xs.
Variable to drop: cumVirusTests_x_FTSE_Low_Price
Adjusted R² = 0.9933400428782285 , max(X p-value) = 0.1350790840353481 , rank deficiency = 0 , for 6 Xs.
Variable to drop: FTSE_Volume_sqrt
Adjusted R² = 0.9933162380674546 , max(X p-value) = 0.04400453598516331 , rank deficiency = 0 , for 5 Xs.
Variable to drop: FTSE_Open_Price_sqrt
Adjusted R² = 0.9932571971193325 , max(X p-value) = 0.04136836918963732 , rank deficiency = 0 , for 4 Xs.
Variable to drop: FTSE_High_Price_sqrt
Adjusted R² = 0.9931957761886894 , max(X p-value) = 3.8933493736165473e-10 , rank deficiency = 0 , for 3 Xs.
Variable to drop: FTSE_Close_Price_sqrt
Adjusted R² = 0.9924495478708685 , max(X p-value) = 0.6344145836600983 , rank deficiency = 0 , for 2 Xs.
Variable to drop: FTSE_Low_Price_sqrt
Adjusted R² = 0.9924662903673915 , max(X p-value) = 0.0 , rank deficiency = 0 , for 1 Xs.
Variable left: SPY_Open_Price
Best model has 5 Xs (Adjusted R² = 0.9933162380674546 , rank deficiency = 0):
Results: Generalized linear model
=======================================================================
Model: GLM AIC: 1836.0903
Link Function: identity BIC: 1691.5939
Dependent Variable: SPY_Close_Price Log-Likelihood: -912.05
Date: 2021-04-22 13:48 LL-Null: -26922.
No. Observations: 351 Deviance: 3713.6
Df Model: 5 Pearson chi2: 3.71e+03
Df Residuals: 345 Scale: 10.764
Method: IRLS
-----------------------------------------------------------------------
Coef. Std.Err. z P>|z| [0.025 0.975]
-----------------------------------------------------------------------
const -4.6953 5.1132 -0.9183 0.3585 -14.7169 5.3263
SPY_Open_Price 1.0064 0.0063 160.7588 0.0000 0.9941 1.0187
FTSE_Low_Price_sqrt -3.5109 0.6694 -5.2452 0.0000 -4.8228 -2.1990
FTSE_Close_Price_sqrt 4.2477 0.6251 6.7957 0.0000 3.0226 5.4728
FTSE_Open_Price_sqrt 0.9623 0.4778 2.0140 0.0440 0.0258 1.8988
FTSE_High_Price_sqrt -1.6756 0.5832 -2.8730 0.0041 -2.8187 -0.5325
=======================================================================
Descending order of 5 X's significance, assuming GLM Gaussian error distribution:
Coefficient z-stat
const -4.695333 -0.918283
SPY_Open_Price 1.006405 160.758791
FTSE_Close_Price_sqrt 4.247670 6.795701
FTSE_Low_Price_sqrt -3.510914 -5.245213
FTSE_High_Price_sqrt -1.675599 -2.872965
FTSE_Open_Price_sqrt 0.962302 2.014048
Rank deficiency = 0: Df Model (5) is same as number of Xs (5).
Partial Leverage (or Partial Regression, or Added-Variable) diagnostic plots for fit:
GLM Gaussian fit using reproducible random 80% (x_train & y_train) of data rows:
Mean Absolute Residual = 2.428283384639719
Root Mean Squared Residual = 3.2526846704549124
R² = 0.9934117203807767
GLM Gaussian prediction using remaining 20% (x_test & y_test) of data rows:
Mean Absolute Error = 2.2863983563103485
Root Mean Squared Error = 3.156815927725248
R² = 0.9927266968583087
Plots of train-set fit & test-set predict:
========================================================== Next GLM Model ==============================================================
counter = 10
GLM Model Fitted = Normal
1 entered.
Assuming Normal error distribution.
X pairs with correlations > 0.995 :
(no more)
X pairs with correlations > 0.995 :
US_Covid_Cases_sqar , cumAdmissions_sqar
SPY_Open_Price_sqrt , SPY_Open_Price
2 variables considered for deletion:
US_Covid_Cases_sqar
SPY_Open_Price_sqrt
X pairs with correlations > 0.995 :
SPY_Open_Price_sqar , SPY_Open_Price
1 variables considered for deletion:
SPY_Open_Price_sqar
X pairs with correlations > 0.995 :
(no more)
3 transformed variables deleted.
Fit using reproducible random 80% (x_train & y_train) of data rows:
OLS fit including only 18 untransformed Xs:
Results: Ordinary least squares
==========================================================================================
Model: OLS Adj. R-squared: 0.994
Dependent Variable: SPY_Close_Price AIC: 1835.0027
Date: 2021-04-22 13:48 BIC: 1908.3576
No. Observations: 351 Log-Likelihood: -898.50
Df Model: 18 F-statistic: 3006.
Df Residuals: 332 Prob (F-statistic): 0.00
R-squared: 0.994 Scale: 10.355
------------------------------------------------------------------------------------------
Coef. Std.Err. t P>|t| [0.025 0.975]
------------------------------------------------------------------------------------------
Intercept -13.3843 8.5592 -1.5637 0.1188 -30.2213 3.4528
SPY_Open_Price 0.9383 0.0297 31.5925 0.0000 0.8799 0.9967
US_Covid_Deaths -0.0000 0.0000 -1.1948 0.2330 -0.0001 0.0000
US_Covid_Cases 0.0000 0.0000 1.8949 0.0590 -0.0000 0.0000
Unnamed__0 0.0294 0.0167 1.7603 0.0793 -0.0035 0.0622
cumVirusTests 0.0000 0.0000 0.6737 0.5010 -0.0000 0.0000
cumAdmissions -0.0001 0.0001 -1.1298 0.2594 -0.0003 0.0001
cumDailyNsoDeathsByDeathDate 0.0002 0.0002 1.2122 0.2263 -0.0002 0.0006
cumCasesByPublishDate -0.0000 0.0000 -0.1598 0.8731 -0.0000 0.0000
FTSE_Low_Price -0.0235 0.0048 -4.9218 0.0000 -0.0328 -0.0141
FTSE_Close_Price 0.0275 0.0042 6.6060 0.0000 0.0193 0.0357
FTSE_Open_Price 0.0054 0.0032 1.7088 0.0884 -0.0008 0.0117
FTSE_High_Price -0.0072 0.0049 -1.4791 0.1401 -0.0169 0.0024
cumPeopleVaccinatedCompleteByPublishDate 0.0000 0.0000 0.3621 0.7175 -0.0000 0.0000
FTSE_Volume -0.0000 0.0000 -2.4540 0.0146 -0.0000 -0.0000
Sun_Hours 0.0054 0.0264 0.2051 0.8377 -0.0465 0.0573
Rainfall_mm 0.0727 0.0381 1.9087 0.0572 -0.0022 0.1476
Max_Temperature_DegC 1.3880 0.9208 1.5073 0.1327 -0.4234 3.1994
Min_Temperature_DegC -1.4217 0.9891 -1.4374 0.1516 -3.3674 0.5240
------------------------------------------------------------------------------------------
Omnibus: 18.360 Durbin-Watson: 2.007
Prob(Omnibus): 0.000 Jarque-Bera (JB): 30.718
Skew: -0.340 Prob(JB): 0.000
Kurtosis: 4.280 Condition No.: 50134875179
==========================================================================================
* The condition number is large (5e+10). This might indicate strong
multicollinearity or other numerical problems.
Descending order of 18 X's significance, assuming Normal error distribution:
SPY_Open_Price
FTSE_Close_Price
FTSE_Low_Price
FTSE_Volume
Rainfall_mm
US_Covid_Cases
Unnamed__0
FTSE_Open_Price
Max_Temperature_DegC
FTSE_High_Price
Min_Temperature_DegC
cumDailyNsoDeathsByDeathDate
US_Covid_Deaths
cumAdmissions
cumVirusTests
cumPeopleVaccinatedCompleteByPublishDate
Sun_Hours
cumCasesByPublishDate
Rank deficiency = 0: Df Model (18) is same as number of Xs (18).
Normal fit including transformed Xs:
Adjusted R² = 0.9478169684655393 , max(X p-value) = 0.985954234936134 , rank deficiency = 36 , for 51 Xs.
Variable to drop: FTSE_Close_Price_sqar
Adjusted R² = 0.9405827032631453 , max(X p-value) = 0.66373541762269 , rank deficiency = 36 , for 50 Xs.
Variable to drop: FTSE_Volume
Adjusted R² = 0.9449206872053191 , max(X p-value) = 0.6350800931130927 , rank deficiency = 36 , for 49 Xs.
Variable to drop: FTSE_Low_Price_sqar
Adjusted R² = 0.945412610405055 , max(X p-value) = 0.5625501947137284 , rank deficiency = 36 , for 48 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqar
Adjusted R² = 0.9460985619275569 , max(X p-value) = 0.44068273057811835 , rank deficiency = 36 , for 47 Xs.
Variable to drop: US_Covid_Deaths_sqar
Adjusted R² = 0.9463047117785048 , max(X p-value) = 0.12047347853599437 , rank deficiency = 36 , for 46 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqar
Adjusted R² = 0.9458981795317696 , max(X p-value) = 0.036160816482777476 , rank deficiency = 35 , for 45 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate
Adjusted R² = 0.9458541569024241 , max(X p-value) = 0.00434626309597369 , rank deficiency = 34 , for 44 Xs.
Variable to drop: Sun_Hours_sqar
Adjusted R² = 0.9446916344097764 , max(X p-value) = 0.013819673576195943 , rank deficiency = 33 , for 43 Xs.
Variable to drop: cumAdmissions_sqrt
Adjusted R² = 0.9446782593994971 , max(X p-value) = 0.002169988412931058 , rank deficiency = 32 , for 42 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate
Adjusted R² = 0.943270258057936 , max(X p-value) = 0.001657086595347861 , rank deficiency = 32 , for 41 Xs.
Variable to drop: US_Covid_Deaths
Adjusted R² = 0.9413909519985548 , max(X p-value) = 0.019192324630964742 , rank deficiency = 31 , for 40 Xs.
Variable to drop: US_Covid_Cases
Adjusted R² = 0.9402125124635764 , max(X p-value) = 0.5383572875703694 , rank deficiency = 31 , for 39 Xs.
Variable to drop: US_Covid_Deaths_sqrt
Adjusted R² = 0.9402218848359302 , max(X p-value) = 0.030611062954542148 , rank deficiency = 30 , for 38 Xs.
Variable to drop: US_Covid_Cases_sqrt
Adjusted R² = 0.9402359005513438 , max(X p-value) = 3.008474636643294e-06 , rank deficiency = 29 , for 37 Xs.
Variable to drop: FTSE_Open_Price_sqar
Adjusted R² = 0.9366206385102381 , max(X p-value) = 2.033116147578988e-17 , rank deficiency = 29 , for 36 Xs.
Variable to drop: cumVirusTests_sqrt
Adjusted R² = 0.9366165884266224 , max(X p-value) = 1.915503472417556e-18 , rank deficiency = 28 , for 35 Xs.
Variable to drop: cumAdmissions_sqar
Adjusted R² = 0.9216335257590981 , max(X p-value) = 6.875739568743246e-11 , rank deficiency = 28 , for 34 Xs.
Variable to drop: cumCasesByPublishDate_sqar
Adjusted R² = 0.9115937418590563 , max(X p-value) = 5.147013825207387e-21 , rank deficiency = 28 , for 33 Xs.
Variable to drop: FTSE_Volume_sqar
Adjusted R² = 0.9927697252000652 , max(X p-value) = 0.7204203630389256 , rank deficiency = 19 , for 32 Xs.
Variable to drop: FTSE_High_Price
Adjusted R² = 0.9931380275772231 , max(X p-value) = 0.8204399941897406 , rank deficiency = 18 , for 31 Xs.
Variable to drop: Rainfall_mm_sqar
Adjusted R² = 0.9931159946747666 , max(X p-value) = 0.7760742197335619 , rank deficiency = 18 , for 30 Xs.
Variable to drop: Max_Temperature_DegC_sqar
Adjusted R² = 0.9932126618747562 , max(X p-value) = 0.9888613777837427 , rank deficiency = 18 , for 29 Xs.
Variable to drop: Min_Temperature_DegC
Adjusted R² = 0.9932330127081226 , max(X p-value) = 0.9741794189239255 , rank deficiency = 17 , for 28 Xs.
Variable to drop: Min_Temperature_DegC_sqar
Adjusted R² = 0.9924005797407535 , max(X p-value) = 0.7460978702217127 , rank deficiency = 16 , for 27 Xs.
Variable to drop: Unnamed__0_sqar
Adjusted R² = 0.993329535996961 , max(X p-value) = 0.6035926731095812 , rank deficiency = 16 , for 26 Xs.
Variable to drop: FTSE_Open_Price
Adjusted R² = 0.9933387650872868 , max(X p-value) = 0.14398956469623952 , rank deficiency = 16 , for 25 Xs.
Variable to drop: FTSE_High_Price_sqar
Adjusted R² = 0.9934605156544192 , max(X p-value) = 0.3820228006915348 , rank deficiency = 16 , for 24 Xs.
Variable to drop: cumCasesByPublishDate_sqrt
Adjusted R² = 0.9929343040310185 , max(X p-value) = 0.6648212615623568 , rank deficiency = 16 , for 23 Xs.
Variable to drop: cumDailyNsoDeathsByDeathDate_sqrt
Adjusted R² = 0.9934190055744618 , max(X p-value) = 0.6902162438139163 , rank deficiency = 15 , for 22 Xs.
Variable to drop: Sun_Hours
Adjusted R² = 0.9934831128429907 , max(X p-value) = 0.8638430454200229 , rank deficiency = 14 , for 21 Xs.
Variable to drop: Rainfall_mm
Adjusted R² = 0.9934943454953131 , max(X p-value) = 0.5338233455915548 , rank deficiency = 13 , for 20 Xs.
Variable to drop: Unnamed__0_sqrt
Adjusted R² = 0.9934811904207127 , max(X p-value) = 0.32585069696809665 , rank deficiency = 12 , for 19 Xs.
Variable to drop: Unnamed__0
Adjusted R² = 0.9932161845767776 , max(X p-value) = 0.8174400458926784 , rank deficiency = 11 , for 18 Xs.
Variable to drop: cumPeopleVaccinatedCompleteByPublishDate_sqrt
Adjusted R² = 0.9932347672545414 , max(X p-value) = 0.25294991311910264 , rank deficiency = 11 , for 17 Xs.
Variable to drop: cumAdmissions
Adjusted R² = 0.9932402807763137 , max(X p-value) = 0.00034782495726692897 , rank deficiency = 11 , for 16 Xs.
Variable to drop: cumCasesByPublishDate
Adjusted R² = 0.9930776121862502 , max(X p-value) = 0.0211799524042513 , rank deficiency = 10 , for 15 Xs.
Variable to drop: cumVirusTests_sqar
Adjusted R² = 0.9933930706265583 , max(X p-value) = 0.5284879501741218 , rank deficiency = 0 , for 14 Xs.
Variable to drop: Sun_Hours_sqrt
Adjusted R² = 0.9934048704581147 , max(X p-value) = 0.5843767842628067 , rank deficiency = 0 , for 13 Xs.
Variable to drop: Max_Temperature_DegC
Adjusted R² = 0.993418533115842 , max(X p-value) = 0.5203788233327193 , rank deficiency = 0 , for 12 Xs.
Variable to drop: Min_Temperature_DegC_sqrt
Adjusted R² = 0.9934299097865394 , max(X p-value) = 0.724109308600305 , rank deficiency = 0 , for 11 Xs.
Variable to drop: Rainfall_mm_sqrt
Adjusted R² = 0.9934468220450595 , max(X p-value) = 0.1258699008334402 , rank deficiency = 0 , for 10 Xs.
Variable to drop: FTSE_Low_Price
Adjusted R² = 0.9934207962140928 , max(X p-value) = 0.1311952036120294 , rank deficiency = 0 , for 9 Xs.
Variable to drop: FTSE_High_Price_sqrt
Adjusted R² = 0.9933959934558395 , max(X p-value) = 0.36900897193371773 , rank deficiency = 0 , for 8 Xs.
Variable to drop: FTSE_Open_Price_sqrt
Adjusted R² = 0.9933996683032327 , max(X p-value) = 0.09583175115830217 , rank deficiency = 0 , for 7 Xs.
Variable to drop: Max_Temperature_DegC_sqrt
Adjusted R² = 0.9933653446973529 , max(X p-value) = 0.07126909932971441 , rank deficiency = 0 , for 6 Xs.
Variable to drop: FTSE_Close_Price
Adjusted R² = 0.9933216183820517 , max(X p-value) = 0.1061665713594618 , rank deficiency = 0 , for 5 Xs.
Variable to drop: cumVirusTests
Adjusted R² = 0.9932902707374486 , max(X p-value) = 0.015764633163752196 , rank deficiency = 0 , for 4 Xs.
Variable to drop: FTSE_Volume_sqrt
Adjusted R² = 0.9931957761886889 , max(X p-value) = 1.1479826891071416e-09 , rank deficiency = 0 , for 3 Xs.
Variable to drop: FTSE_Close_Price_sqrt
Adjusted R² = 0.9924495478708683 , max(X p-value) = 0.6347129256625307 , rank deficiency = 0 , for 2 Xs.
Variable to drop: FTSE_Low_Price_sqrt
Adjusted R² = 0.9924662903673914 , max(X p-value) = 0.0 , rank deficiency = 0 , for 1 Xs.
Variable left: SPY_Open_Price
Restarting from best model (with 4 Xs & Adjusted R² = 0.9932902707374486) found so far...
Adding 0 2-way interactions among 1 untransformed variables in best model found so far:
X pairs with correlations > 0.995 :
(no more)
Adjusted R² = 0.9932902707374486 , max(X p-value) = 0.015764633163752196 , rank deficiency = 0 , for 4 Xs.
Variable to drop: FTSE_Volume_sqrt
Adjusted R² = 0.9931957761886889 , max(X p-value) = 1.1479826891071416e-09 , rank deficiency = 0 , for 3 Xs.
Variable to drop: FTSE_Close_Price_sqrt
Adjusted R² = 0.9924495478708683 , max(X p-value) = 0.6347129256625307 , rank deficiency = 0 , for 2 Xs.
Variable to drop: FTSE_Low_Price_sqrt
Adjusted R² = 0.9924662903673914 , max(X p-value) = 0.0 , rank deficiency = 0 , for 1 Xs.
Variable left: SPY_Open_Price
Best model has 4 Xs (Adjusted R² = 0.9932902707374486 , rank deficiency = 0):
Results: Ordinary least squares
=======================================================================
Model: OLS Adj. R-squared: 0.993
Dependent Variable: SPY_Close_Price AIC: 1836.4673
Date: 2021-04-22 13:48 BIC: 1855.7712
No. Observations: 351 Log-Likelihood: -913.23
Df Model: 4 F-statistic: 1.295e+04
Df Residuals: 346 Prob (F-statistic): 0.00
R-squared: 0.993 Scale: 10.806
-----------------------------------------------------------------------
Coef. Std.Err. t P>|t| [0.025 0.975]
-----------------------------------------------------------------------
Intercept -4.2118 5.2625 -0.8003 0.4241 -14.5624 6.1388
SPY_Open_Price 1.0070 0.0061 166.2750 0.0000 0.9951 1.0189
FTSE_Low_Price_sqrt -3.4388 0.5173 -6.6478 0.0000 -4.4563 -2.4214
FTSE_Close_Price_sqrt 3.4778 0.5255 6.6178 0.0000 2.4442 4.5114
FTSE_Volume_sqrt -0.0001 0.0000 -2.4263 0.0158 -0.0002 -0.0000
-----------------------------------------------------------------------
Omnibus: 25.063 Durbin-Watson: 2.018
Prob(Omnibus): 0.000 Jarque-Bera (JB): 34.477
Skew: -0.533 Prob(JB): 0.000
Kurtosis: 4.105 Condition No.: 911529
=======================================================================
* The condition number is large (9e+05). This might indicate
strong multicollinearity or other numerical problems.
Descending order of 4 X's significance, assuming Normal error distribution:
Coefficient z-stat
Intercept -4.211777 -0.800332
SPY_Open_Price 1.006991 166.274951
FTSE_Low_Price_sqrt -3.438845 -6.647758
FTSE_Close_Price_sqrt 3.477797 6.617810
FTSE_Volume_sqrt -0.000085 -2.426288
Rank deficiency = 0: Df Model (4) is same as number of Xs (4).
Partial Leverage (or Partial Regression, or Added-Variable) diagnostic plots for fit:
Normal fit using reproducible random 80% (x_train & y_train) of data rows:
Mean Absolute Residual = 2.437777470045315
Root Mean Squared Residual = 3.2637168743878178
R² = 0.9933669533575931
Normal prediction using remaining 20% (x_test & y_test) of data rows:
Mean Absolute Error = 2.278512896432442
Root Mean Squared Error = 3.282785931543249
R² = 0.9922985759558204
Plots of train-set fit & test-set predict:
# Random Forest and OLS (with variable selection) Regressions
#https://data-flair.training/blogs/train-test-set-in-python-ml
import time
stm = time.time()
import numpy as np
import pandas as pd
import matplotlib.pyplot as pl
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
#warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.simplefilter('ignore')
import os
# Switch to the course data folder so that relative file names resolve there.
# Locations are tried in order: the Windows profile folder, then the macOS
# home folder. If an environment variable is missing (KeyError) or the
# folder cannot be entered (OSError), the next option is tried; as a last
# resort the working directory is left unchanged.
try:
    os.chdir(os.environ['USERPROFILE'] + r'\Documents' + r"\0_Teach\data")  # change "\0_Teach\data" accordingly
except (KeyError, OSError):
    try:
        os.chdir('/Users/' + os.environ['USER'] + '/Documents' + r"/0_Teach/data")  # for Mac
    except (KeyError, OSError):
        # assume data file in current folder
        pass
# Read the cleaned data set and separate the response (y) from the predictors (x).
df_clean = pd.read_csv('cleaned_data.csv', index_col=0)  # 1st column holds the row labels (the 'index')
yname = 'FTSE_Close_Price'  # response variable
# Predictors = every column except the response, restricted to numeric
# columns that contain no missing values. drop() is used instead of indexing
# with a set: set indexers are rejected by pandas >= 2.0 and gave an
# arbitrary column order from run to run.
x = df_clean.drop(columns=[yname]).select_dtypes(include='number').dropna(axis=1)
y = df_clean[yname]
print("Regress '" + y.name + "' on", list(x), '\n')
# Split into training (70%) & testing (30%) sets.
from sklearn.model_selection import train_test_split
# No random_state is passed, so the split (and every figure below) changes
# from run to run; add e.g. random_state=123 for a reproducible split.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
# Fit a random forest on the training set. min_samples_leaf is set only to
# keep the individual trees small enough to be drawn.
from sklearn.ensemble import RandomForestRegressor
print('Random Forest regression may take time...\n')
part = .1  # minimum leaf size, as a fraction of the training rows
model = RandomForestRegressor(n_jobs=-1, min_samples_leaf=part)  # for displaying tree
model.fit(x_train, y_train)
xnames = list(x_train)  # predictor column names, in fitting order
# scikit-learn turns a fractional min_samples_leaf into
# ceil(fraction * n_samples), so round up here as well.
print('Each end node will have at least', int(np.ceil(len(x_train) * part)), 'observations.\n')
import pydot
from IPython.display import Image
from sklearn import tree
rint = np.random.randint(len(model.estimators_))
print('Randomly selected #' + str(rint + 1), 'of', len(model.estimators_), 'trees in the Forest:')
display(Image(pydot.graph_from_dot_data(tree.export_graphviz(model.estimators_[rint],
feature_names=xnames, class_names=[str(a) for a in sorted(list(df_clean[yname].unique()))],
filled=True, rounded=True, special_characters=True))[0].create_png()))
#https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn.ensemble.RandomForestRegressor.score
print('Coefficient-of-Determination of fit (could be better if tree display not needed) =', model.score(df_clean[xnames], df_clean[yname]), '\n')
print('\nRefitting properly after displaying tree ...\n')
model = RandomForestRegressor(n_jobs=-1).fit(x_train, y_train)
#predict using test set
fitRF = model.predict(x_test)
from sklearn.metrics import mean_absolute_error
print('Random forest validation MAE =', mean_absolute_error(y_test, fitRF))
from sklearn.metrics import mean_squared_error
print('Random forest validation RMSE =', np.sqrt(mean_squared_error(y_test, fitRF)))
#this R² is the squared correlation between actual and predicted values;
# axis is passed by keyword because newer pandas rejects it as a positional argument
print('Random forest validation R² =',
      pd.concat([y_test, pd.Series(fitRF, index=y_test.index)], axis=1).corr().iloc[0, 1] ** 2)
#plot y vs y-hat
pl.rcParams["figure.figsize"] = [6.000, 6.143] #square plot
#plot pseudo regression line (least-squares line of actual on predicted values)
s, i = np.polyfit(fitRF, y_test, 1) #s=slope, i=intercept
a, b = min(fitRF), max(fitRF)
pl.plot([a, b], [i + s * a, i + s * b], 'red', linewidth=0.4) #increase linewidth for darker line
del s, i, a, b
#scatter y vs y-hat
pl.scatter(fitRF, y_test, s=1, linewidths=0)
xn = 'random forest fitRF'
yn = y_test.name
pl.xlabel(xn)
pl.ylabel(yn)
pl.xticks(rotation=90)
pl.title("'" + yn + "' vs '" + xn + "' for " + str(len(y_test)) + ' obs')
pl.show()
#stm was set at the top of the cell, so the time covers everything up to here
print('\nRandom Forest regression for', len(x_train), 'rows and', x_train.shape[1], 'columns, and prediction for',
      len(x_test), 'rows took', '%.2f' % ((time.time() - stm) / 60), 'mins.')
#ols variable selection: backward elimination, dropping the x with the smallest |t| at each step
# and remembering the model with the largest Adjusted R²
stm = time.time()
print('\n\nOLS with variable selection.')
#NOTE: this rebinds the notebook-level names df (here: training rows, response first) and sm
df = pd.concat([y_train, x_train], axis=1)
#model number-1, i.e. best; keep updating this:
m1 = ' + '.join(list(df)).replace('+', '~', 1) #'y ~ x1 + x2 + ...'
print('\nFull Model:', m1)
import statsmodels.formula.api as sm
out = sm.ols(m1, df).fit()
#r² number-1, i.e. best
r1 = out.rsquared_adj
best = out #fitted model belonging to m1 and r1
print('Adj R² =', r1, '\n')
xs = list(df) #y followed by the x variables still in the model, keep updating this
while len(xs) > 2: #(y and last x) make 2 variables
    tabs = out.tvalues.iloc[1:].abs() #omit intercept, which is the first entry
    xs.remove(tabs.idxmin()) #remove name of variable with smallest |t|
    mod = ' + '.join(xs).replace('+', '~', 1) #rebuild model equation
    print(mod)
    out = sm.ols(mod, df).fit()
    radj = out.rsquared_adj
    print('Adj R² =', radj, '\n')
    if radj > r1: #found model with larger Adjusted R²
        r1 = radj #update best Adjusted R²
        m1 = mod #update best model equation
        best = out #keep its fit, so it need not be refitted
print('Best Model:', m1)
print('Best Adj R² =', r1, '\n')
#predict the test set with the best model
fitOLS = best.predict(x_test)
print('OLS validation MAE =', mean_absolute_error(y_test, fitOLS))
print('OLS validation RMSE =', np.sqrt(mean_squared_error(y_test, fitOLS)))
#axis is passed by keyword because newer pandas rejects it as a positional argument
print('OLS validation R² =', pd.concat([y_test, fitOLS], axis=1).corr().iloc[0, 1] ** 2)
#plot pseudo regression line (least-squares line of actual on predicted values)
s, i = np.polyfit(fitOLS, y_test, 1) #s=slope, i=intercept
a, b = min(fitOLS), max(fitOLS)
pl.plot([a, b], [i + s * a, i + s * b], 'red', linewidth=0.4) #increase linewidth for darker line
del s, i, a, b
#scatter y vs y-hat
pl.scatter(fitOLS, y_test, s=1, linewidths=0)
xn = 'ols fitOLS'
yn = y_test.name
pl.xlabel(xn)
pl.ylabel(yn)
pl.xticks(rotation=90)
pl.title("'" + yn + "' vs '" + xn + "' for " + str(len(y_test)) + ' obs')
pl.show()
print('\nOLS variable selection for', len(x_train), 'rows and', x_train.shape[1], 'columns, and prediction for',
      len(x_test), 'rows took', '%.2f' % ((time.time() - stm) / 60), 'mins.')
Regress 'FTSE_Close_Price' on ['Min_Temperature_DegC', 'FTSE_Open_Price', 'Max_Temperature_DegC', 'US_Covid_Cases', 'SPY_Open_Price', 'US_Covid_Deaths', 'FTSE_Volume', 'cumDailyNsoDeathsByDeathDate', 'cumAdmissions', 'FTSE_Low_Price', 'Rainfall_mm', 'SPY_Close_Price', 'cumPeopleVaccinatedCompleteByPublishDate', 'cumCasesByPublishDate', 'Sun_Hours', 'cumVirusTests', 'FTSE_High_Price'] Random Forest regression may take time... Each end node will have at least 30 observations. Randomly selected #94 of 100 trees in the Forest:
Coefficient-of-Determination of fit (could be better if tree display not needed) = 0.9076446521359514 Refitting properly after displaying tree ... Random forest validation MAE = 33.22239772727328 Random forest validation RMSE = 58.0380914219514 Random forest validation R² = 0.9825304539800646
Random Forest regression for 307 rows and 17 columns, and prediction for 132 rows took 0.03 mins. OLS with variable selection. Full Model: FTSE_Close_Price ~ Min_Temperature_DegC + FTSE_Open_Price + Max_Temperature_DegC + US_Covid_Cases + SPY_Open_Price + US_Covid_Deaths + FTSE_Volume + cumDailyNsoDeathsByDeathDate + cumAdmissions + FTSE_Low_Price + Rainfall_mm + SPY_Close_Price + cumPeopleVaccinatedCompleteByPublishDate + cumCasesByPublishDate + Sun_Hours + cumVirusTests + FTSE_High_Price Adj R² = 0.9945830997608046 FTSE_Close_Price ~ Min_Temperature_DegC + FTSE_Open_Price + Max_Temperature_DegC + US_Covid_Cases + SPY_Open_Price + FTSE_Volume + cumDailyNsoDeathsByDeathDate + cumAdmissions + FTSE_Low_Price + Rainfall_mm + SPY_Close_Price + cumPeopleVaccinatedCompleteByPublishDate + cumCasesByPublishDate + Sun_Hours + cumVirusTests + FTSE_High_Price Adj R² = 0.9946016609453648 FTSE_Close_Price ~ Min_Temperature_DegC + FTSE_Open_Price + Max_Temperature_DegC + US_Covid_Cases + SPY_Open_Price + FTSE_Volume + cumDailyNsoDeathsByDeathDate + cumAdmissions + FTSE_Low_Price + Rainfall_mm + SPY_Close_Price + cumPeopleVaccinatedCompleteByPublishDate + Sun_Hours + cumVirusTests + FTSE_High_Price Adj R² = 0.9946196305697863 FTSE_Close_Price ~ Min_Temperature_DegC + FTSE_Open_Price + Max_Temperature_DegC + US_Covid_Cases + SPY_Open_Price + cumDailyNsoDeathsByDeathDate + cumAdmissions + FTSE_Low_Price + Rainfall_mm + SPY_Close_Price + cumPeopleVaccinatedCompleteByPublishDate + Sun_Hours + cumVirusTests + FTSE_High_Price Adj R² = 0.994636685247911 FTSE_Close_Price ~ Min_Temperature_DegC + FTSE_Open_Price + Max_Temperature_DegC + US_Covid_Cases + SPY_Open_Price + cumDailyNsoDeathsByDeathDate + cumAdmissions + FTSE_Low_Price + SPY_Close_Price + cumPeopleVaccinatedCompleteByPublishDate + Sun_Hours + cumVirusTests + FTSE_High_Price Adj R² = 0.994636833228185 FTSE_Close_Price ~ Min_Temperature_DegC + FTSE_Open_Price + Max_Temperature_DegC + US_Covid_Cases + SPY_Open_Price + 
cumDailyNsoDeathsByDeathDate + cumAdmissions + FTSE_Low_Price + SPY_Close_Price + cumPeopleVaccinatedCompleteByPublishDate + cumVirusTests + FTSE_High_Price Adj R² = 0.9946257570603207 FTSE_Close_Price ~ FTSE_Open_Price + Max_Temperature_DegC + US_Covid_Cases + SPY_Open_Price + cumDailyNsoDeathsByDeathDate + cumAdmissions + FTSE_Low_Price + SPY_Close_Price + cumPeopleVaccinatedCompleteByPublishDate + cumVirusTests + FTSE_High_Price Adj R² = 0.9946035992458004 FTSE_Close_Price ~ FTSE_Open_Price + Max_Temperature_DegC + US_Covid_Cases + SPY_Open_Price + cumAdmissions + FTSE_Low_Price + SPY_Close_Price + cumPeopleVaccinatedCompleteByPublishDate + cumVirusTests + FTSE_High_Price Adj R² = 0.9945717810924996 FTSE_Close_Price ~ FTSE_Open_Price + US_Covid_Cases + SPY_Open_Price + cumAdmissions + FTSE_Low_Price + SPY_Close_Price + cumPeopleVaccinatedCompleteByPublishDate + cumVirusTests + FTSE_High_Price Adj R² = 0.9945362280583909 FTSE_Close_Price ~ FTSE_Open_Price + US_Covid_Cases + SPY_Open_Price + FTSE_Low_Price + SPY_Close_Price + cumPeopleVaccinatedCompleteByPublishDate + cumVirusTests + FTSE_High_Price Adj R² = 0.9944986594852554 FTSE_Close_Price ~ FTSE_Open_Price + SPY_Open_Price + FTSE_Low_Price + SPY_Close_Price + cumPeopleVaccinatedCompleteByPublishDate + cumVirusTests + FTSE_High_Price Adj R² = 0.9944664157384357 FTSE_Close_Price ~ FTSE_Open_Price + SPY_Open_Price + FTSE_Low_Price + SPY_Close_Price + cumVirusTests + FTSE_High_Price Adj R² = 0.9944529512101293 FTSE_Close_Price ~ FTSE_Open_Price + SPY_Open_Price + FTSE_Low_Price + SPY_Close_Price + FTSE_High_Price Adj R² = 0.9944443792051609 FTSE_Close_Price ~ FTSE_Open_Price + SPY_Open_Price + FTSE_Low_Price + FTSE_High_Price Adj R² = 0.9936475998442735 FTSE_Close_Price ~ FTSE_Open_Price + FTSE_Low_Price + FTSE_High_Price Adj R² = 0.9936548267485216 FTSE_Close_Price ~ FTSE_Low_Price + FTSE_High_Price Adj R² = 0.9917514072261262 FTSE_Close_Price ~ FTSE_Low_Price Adj R² = 0.9899506667666996 Best Model: 
FTSE_Close_Price ~ Min_Temperature_DegC + FTSE_Open_Price + Max_Temperature_DegC + US_Covid_Cases + SPY_Open_Price + cumDailyNsoDeathsByDeathDate + cumAdmissions + FTSE_Low_Price + SPY_Close_Price + cumPeopleVaccinatedCompleteByPublishDate + Sun_Hours + cumVirusTests + FTSE_High_Price Best Adj R² = 0.994636833228185 OLS validation MAE = 26.89656103846762 OLS validation RMSE = 38.50149639045949 OLS validation R² = 0.9921260225554243
OLS variable selection for 307 rows and 17 columns, and prediction for 132 rows took 0.02 mins.
# Random Forest and OLS (with variable selection) Regressions
#https://data-flair.training/blogs/train-test-set-in-python-ml
import time
stm = time.time()
import numpy as np
import pandas as pd
import matplotlib.pyplot as pl
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
#warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.simplefilter('ignore')
import os
#move to the folder holding the data file: try the Windows location first, then the Mac one
#change the '0_Teach', 'data' parts accordingly
#only a missing environment variable (KeyError) or an unusable folder (OSError) is tolerated
try:
    os.chdir(os.path.join(os.environ['USERPROFILE'], 'Documents', '0_Teach', 'data')) #for Windows
except (KeyError, OSError):
    try:
        os.chdir(os.path.join('/Users', os.environ['USER'], 'Documents', '0_Teach', 'data')) #for Mac
    except (KeyError, OSError):
        #assume data file in current folder
        pass
#read data; the 1st csv column holds the row labels (called 'index')
df_clean = pd.read_csv('cleaned_data.csv', index_col=0)
yname = 'SPY_Close_Price'
#predictors: every numeric column except the response, without columns that contain missing values
#the response is dropped by name; indexing the dataframe with a set is rejected by newer pandas
# and gave the columns an arbitrary order
x = df_clean.drop(columns=[yname]).select_dtypes(include='number').dropna(axis=1)
y = df_clean[yname]
print("Regress '" + y.name + "' on", list(x),'\n')
#split into training (70%) & testing (30%) sets
from sklearn.model_selection import train_test_split
#no random_state is given, so the split and every number below change on each run;
# pass e.g. random_state=123 for a reproducible split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
#fit random forest model on training set
import math
from sklearn.ensemble import RandomForestRegressor
print('Random Forest regression may take time...\n')
#a float min_samples_leaf is a fraction of the training rows; large leaves keep the drawn tree small
part = .1
model = RandomForestRegressor(n_jobs=-1, min_samples_leaf=part) #for displaying tree
model.fit(x_train, y_train)
xnames = list(x_train)
#scikit-learn rounds the fraction up, i.e. uses ceil(min_samples_leaf * n_samples)
print('Each end node will have at least', math.ceil(len(x_train) * part), 'observations.\n')
import pydot
from IPython.display import Image, display
from sklearn import tree
#draw one randomly chosen tree of the forest
rint = np.random.randint(len(model.estimators_))
print('Randomly selected #' + str(rint + 1), 'of', len(model.estimators_), 'trees in the Forest:')
#class_names is only relevant for classifiers, so it is not passed for this regressor
dot_text = tree.export_graphviz(model.estimators_[rint], feature_names=xnames,
                                filled=True, rounded=True, special_characters=True)
display(Image(pydot.graph_from_dot_data(dot_text)[0].create_png()))
#https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn.ensemble.RandomForestRegressor.score
#NOTE: this R² is computed over all rows of df_clean, i.e. training and test rows together
print('Coefficient-of-Determination of fit (could be better if tree display not needed) =', model.score(df_clean[xnames], df_clean[yname]), '\n')
print('\nRefitting properly after displaying tree ...\n')
#refit without the leaf-size restriction; this is the model used for prediction below
model = RandomForestRegressor(n_jobs=-1).fit(x_train, y_train)
#predict using test set
fitRF = model.predict(x_test)
from sklearn.metrics import mean_absolute_error
print('Random forest validation MAE =', mean_absolute_error(y_test, fitRF))
from sklearn.metrics import mean_squared_error
print('Random forest validation RMSE =', np.sqrt(mean_squared_error(y_test, fitRF)))
#this R² is the squared correlation between actual and predicted values;
# axis is passed by keyword because newer pandas rejects it as a positional argument
print('Random forest validation R² =',
      pd.concat([y_test, pd.Series(fitRF, index=y_test.index)], axis=1).corr().iloc[0, 1] ** 2)
#plot y vs y-hat
pl.rcParams["figure.figsize"] = [6.000, 6.143] #square plot
#plot pseudo regression line (least-squares line of actual on predicted values)
s, i = np.polyfit(fitRF, y_test, 1) #s=slope, i=intercept
a, b = min(fitRF), max(fitRF)
pl.plot([a, b], [i + s * a, i + s * b], 'red', linewidth=0.4) #increase linewidth for darker line
del s, i, a, b
#scatter y vs y-hat
pl.scatter(fitRF, y_test, s=1, linewidths=0)
xn = 'random forest fitRF'
yn = y_test.name
pl.xlabel(xn)
pl.ylabel(yn)
pl.xticks(rotation=90)
pl.title("'" + yn + "' vs '" + xn + "' for " + str(len(y_test)) + ' obs')
pl.show()
#stm was set at the top of the cell, so the time covers everything up to here
print('\nRandom Forest regression for', len(x_train), 'rows and', x_train.shape[1], 'columns, and prediction for',
      len(x_test), 'rows took', '%.2f' % ((time.time() - stm) / 60), 'mins.')
#ols variable selection: backward elimination, dropping the x with the smallest |t| at each step
# and remembering the model with the largest Adjusted R²
stm = time.time()
print('\n\nOLS with variable selection.')
#NOTE: this rebinds the notebook-level names df (here: training rows, response first) and sm
df = pd.concat([y_train, x_train], axis=1)
#model number-1, i.e. best; keep updating this:
m1 = ' + '.join(list(df)).replace('+', '~', 1) #'y ~ x1 + x2 + ...'
print('\nFull Model:', m1)
import statsmodels.formula.api as sm
out = sm.ols(m1, df).fit()
#r² number-1, i.e. best
r1 = out.rsquared_adj
best = out #fitted model belonging to m1 and r1
print('Adj R² =', r1, '\n')
xs = list(df) #y followed by the x variables still in the model, keep updating this
while len(xs) > 2: #(y and last x) make 2 variables
    tabs = out.tvalues.iloc[1:].abs() #omit intercept, which is the first entry
    xs.remove(tabs.idxmin()) #remove name of variable with smallest |t|
    mod = ' + '.join(xs).replace('+', '~', 1) #rebuild model equation
    print(mod)
    out = sm.ols(mod, df).fit()
    radj = out.rsquared_adj
    print('Adj R² =', radj, '\n')
    if radj > r1: #found model with larger Adjusted R²
        r1 = radj #update best Adjusted R²
        m1 = mod #update best model equation
        best = out #keep its fit, so it need not be refitted
print('Best Model:', m1)
print('Best Adj R² =', r1, '\n')
#predict the test set with the best model
fitOLS = best.predict(x_test)
print('OLS validation MAE =', mean_absolute_error(y_test, fitOLS))
print('OLS validation RMSE =', np.sqrt(mean_squared_error(y_test, fitOLS)))
#axis is passed by keyword because newer pandas rejects it as a positional argument
print('OLS validation R² =', pd.concat([y_test, fitOLS], axis=1).corr().iloc[0, 1] ** 2)
#plot pseudo regression line (least-squares line of actual on predicted values)
s, i = np.polyfit(fitOLS, y_test, 1) #s=slope, i=intercept
a, b = min(fitOLS), max(fitOLS)
pl.plot([a, b], [i + s * a, i + s * b], 'red', linewidth=0.4) #increase linewidth for darker line
del s, i, a, b
#scatter y vs y-hat
pl.scatter(fitOLS, y_test, s=1, linewidths=0)
xn = 'ols fitOLS'
yn = y_test.name
pl.xlabel(xn)
pl.ylabel(yn)
pl.xticks(rotation=90)
pl.title("'" + yn + "' vs '" + xn + "' for " + str(len(y_test)) + ' obs')
pl.show()
print('\nOLS variable selection for', len(x_train), 'rows and', x_train.shape[1], 'columns, and prediction for',
      len(x_test), 'rows took', '%.2f' % ((time.time() - stm) / 60), 'mins.')
Regress 'SPY_Close_Price' on ['FTSE_Close_Price', 'Min_Temperature_DegC', 'FTSE_Open_Price', 'Max_Temperature_DegC', 'US_Covid_Cases', 'SPY_Open_Price', 'US_Covid_Deaths', 'FTSE_Volume', 'cumDailyNsoDeathsByDeathDate', 'cumAdmissions', 'FTSE_Low_Price', 'Rainfall_mm', 'cumPeopleVaccinatedCompleteByPublishDate', 'cumCasesByPublishDate', 'Sun_Hours', 'cumVirusTests', 'FTSE_High_Price'] Random Forest regression may take time... Each end node will have at least 30 observations. Randomly selected #99 of 100 trees in the Forest:
Coefficient-of-Determination of fit (could be better if tree display not needed) = 0.9450666620934312 Refitting properly after displaying tree ... Random forest validation MAE = 2.22897158628792 Random forest validation RMSE = 3.255786663144385 Random forest validation R² = 0.9931819000984742
Random Forest regression for 307 rows and 17 columns, and prediction for 132 rows took 0.03 mins. OLS with variable selection. Full Model: SPY_Close_Price ~ FTSE_Close_Price + Min_Temperature_DegC + FTSE_Open_Price + Max_Temperature_DegC + US_Covid_Cases + SPY_Open_Price + US_Covid_Deaths + FTSE_Volume + cumDailyNsoDeathsByDeathDate + cumAdmissions + FTSE_Low_Price + Rainfall_mm + cumPeopleVaccinatedCompleteByPublishDate + cumCasesByPublishDate + Sun_Hours + cumVirusTests + FTSE_High_Price Adj R² = 0.9935447264186372 SPY_Close_Price ~ FTSE_Close_Price + Min_Temperature_DegC + FTSE_Open_Price + Max_Temperature_DegC + US_Covid_Cases + SPY_Open_Price + US_Covid_Deaths + FTSE_Volume + cumDailyNsoDeathsByDeathDate + cumAdmissions + FTSE_Low_Price + cumPeopleVaccinatedCompleteByPublishDate + cumCasesByPublishDate + Sun_Hours + cumVirusTests + FTSE_High_Price Adj R² = 0.9935666776113403 SPY_Close_Price ~ FTSE_Close_Price + Min_Temperature_DegC + FTSE_Open_Price + Max_Temperature_DegC + US_Covid_Cases + SPY_Open_Price + US_Covid_Deaths + FTSE_Volume + cumDailyNsoDeathsByDeathDate + cumAdmissions + FTSE_Low_Price + cumPeopleVaccinatedCompleteByPublishDate + cumCasesByPublishDate + cumVirusTests + FTSE_High_Price Adj R² = 0.9935882037706294 SPY_Close_Price ~ FTSE_Close_Price + Min_Temperature_DegC + FTSE_Open_Price + Max_Temperature_DegC + US_Covid_Cases + SPY_Open_Price + US_Covid_Deaths + FTSE_Volume + cumDailyNsoDeathsByDeathDate + cumAdmissions + FTSE_Low_Price + cumPeopleVaccinatedCompleteByPublishDate + cumCasesByPublishDate + FTSE_High_Price Adj R² = 0.9936093987630036 SPY_Close_Price ~ FTSE_Close_Price + Min_Temperature_DegC + FTSE_Open_Price + Max_Temperature_DegC + US_Covid_Cases + SPY_Open_Price + US_Covid_Deaths + FTSE_Volume + cumDailyNsoDeathsByDeathDate + FTSE_Low_Price + cumPeopleVaccinatedCompleteByPublishDate + cumCasesByPublishDate + FTSE_High_Price Adj R² = 0.9936250543848442 SPY_Close_Price ~ FTSE_Close_Price + Min_Temperature_DegC + FTSE_Open_Price + 
Max_Temperature_DegC + US_Covid_Cases + SPY_Open_Price + US_Covid_Deaths + FTSE_Volume + FTSE_Low_Price + cumPeopleVaccinatedCompleteByPublishDate + cumCasesByPublishDate + FTSE_High_Price Adj R² = 0.9936459126851079 SPY_Close_Price ~ FTSE_Close_Price + FTSE_Open_Price + Max_Temperature_DegC + US_Covid_Cases + SPY_Open_Price + US_Covid_Deaths + FTSE_Volume + FTSE_Low_Price + cumPeopleVaccinatedCompleteByPublishDate + cumCasesByPublishDate + FTSE_High_Price Adj R² = 0.9936569430721897 SPY_Close_Price ~ FTSE_Close_Price + FTSE_Open_Price + Max_Temperature_DegC + US_Covid_Cases + SPY_Open_Price + FTSE_Volume + FTSE_Low_Price + cumPeopleVaccinatedCompleteByPublishDate + cumCasesByPublishDate + FTSE_High_Price Adj R² = 0.9936639685857038 SPY_Close_Price ~ FTSE_Close_Price + FTSE_Open_Price + Max_Temperature_DegC + US_Covid_Cases + SPY_Open_Price + FTSE_Low_Price + cumPeopleVaccinatedCompleteByPublishDate + cumCasesByPublishDate + FTSE_High_Price Adj R² = 0.9936395309615964 SPY_Close_Price ~ FTSE_Close_Price + FTSE_Open_Price + US_Covid_Cases + SPY_Open_Price + FTSE_Low_Price + cumPeopleVaccinatedCompleteByPublishDate + cumCasesByPublishDate + FTSE_High_Price Adj R² = 0.9935965673034789 SPY_Close_Price ~ FTSE_Close_Price + FTSE_Open_Price + US_Covid_Cases + SPY_Open_Price + FTSE_Low_Price + cumCasesByPublishDate + FTSE_High_Price Adj R² = 0.9935472201002022 SPY_Close_Price ~ FTSE_Close_Price + FTSE_Open_Price + US_Covid_Cases + SPY_Open_Price + FTSE_Low_Price + FTSE_High_Price Adj R² = 0.9934594204068847 SPY_Close_Price ~ FTSE_Close_Price + FTSE_Open_Price + SPY_Open_Price + FTSE_Low_Price + FTSE_High_Price Adj R² = 0.9933825502621892 SPY_Close_Price ~ FTSE_Close_Price + SPY_Open_Price + FTSE_Low_Price + FTSE_High_Price Adj R² = 0.9928402977289764 SPY_Close_Price ~ FTSE_Close_Price + SPY_Open_Price + FTSE_Low_Price Adj R² = 0.9927626193886742 SPY_Close_Price ~ SPY_Open_Price + FTSE_Low_Price Adj R² = 0.9924188869763122 SPY_Close_Price ~ SPY_Open_Price Adj R² = 
0.9924330994070361 Best Model: SPY_Close_Price ~ FTSE_Close_Price + FTSE_Open_Price + Max_Temperature_DegC + US_Covid_Cases + SPY_Open_Price + FTSE_Volume + FTSE_Low_Price + cumPeopleVaccinatedCompleteByPublishDate + cumCasesByPublishDate + FTSE_High_Price Best Adj R² = 0.9936639685857038 OLS validation MAE = 2.517444842724377 OLS validation RMSE = 3.3760602183832775 OLS validation R² = 0.9926397923559892
OLS variable selection for 307 rows and 17 columns, and prediction for 132 rows took 0.01 mins.
#Decision Tree Boosting explanation
#Boosting corrects the mistakes of previous learners by fitting patterns in residuals
#https://towardsdatascience.com/gradient-boosting-is-one-of-the-most-effective-ml-techniques-out-there-af6bfd0df342
from sklearn.tree import DecisionTreeRegressor
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.core.display import display, HTML
display(HTML('<style>.container { width:100% !important; }</style>'))
display(HTML('<style>div.output_scroll { height: 31em; }</style>'))
import numpy as np
columns = ['cumCasesByPublishDate', 'cumVirusTests',
'cumDailyNsoDeathsByDeathDate',
'cumPeopleVaccinatedCompleteByPublishDate', 'cumAdmissions',
'FTSE_Open_Price', 'FTSE_High_Price',
'FTSE_Low_Price', 'FTSE_Volume', 'Max_Temperature_DegC',
'Min_Temperature_DegC', 'Rainfall_mm', 'Sun_Hours', 'US_Covid_Cases',
'US_Covid_Deaths', 'SPY_Open_Price']
variables = ['FTSE_Close_Price', 'SPY_Close_Price']
df_clean = pd.read_csv('cleaned_data.csv', index_col=0)
for variable in variables:
for column in columns:
#X is evenly spaced between 0 & 10:
X = df_clean[column]
X = X[:, np.newaxis]
y = df_clean[variable]
#change X from 1 to 2 dimensional:
Xnames = ['{0}'.format(column)]
#plot y vs X:
plt.figure(figsize=(15, 5))
plt.title('{0} vs {1}'.format(variable, column), fontsize=20)
plt.xlabel('{0}'.format(column))
plt.ylabel('{0}'.format(variable));
plt.scatter(X, y, alpha=.7);
plt.show()
def many_trees(n_trees, **kwargs):
#returns a list of n_trees 'empty' decision trees
#**kwargs allows passing of any number of keyword arguments
trees = [DecisionTreeRegressor(**kwargs) for i in range(n_trees)]
return trees
def learners_for_residuals(trees, X, y):
#trains the decision trees sequentially
# first fitting the target outcome y with tree_0
# then from tree_1 to tree_(n-1) fits the residuals
boosters = []
for tree in trees:
tree.fit(X, y)
yhat = tree.predict(X)
#set next y as residual of current y
y = y - yhat
boosters.append(tree)
return boosters
def many_fits(trees, X):
#iterates through the list of learner decision trees and returns each tree's fit for common X
#each learner has its own y (different stages of residuals)
return np.array([tree.predict(X) for tree in trees]).T
#make the learners extremely week by setting the max-depth of each tree to 1
# limits each tree to one split of X when fitting y:
learners = many_trees(30, max_depth=1)
#more generally, could also limit number of Xs (by regularization)
#fits the decision trees sequentially:
boosters = learners_for_residuals(learners, X, y)
boosted_yhat = many_fits(boosters, X)
#plot 1st tree:
print('First of', len(boosters) , 'decision trees of 1-deep:')
from sklearn import tree
import pydot
from IPython.display import Image
display(Image(pydot.graph_from_dot_data(tree.export_graphviz(boosters[0], feature_names=Xnames,
filled=True, rounded=True, special_characters=True))[0].create_png()))
def plot_fits(n_trees, plt_row, X, boosted_yhat):
#sums the fits of n trees and returns plots of the fitted line and the residuals
boosted_fit = boosted_yhat[:, :n_trees].sum(1)
res = boosted_fit - y
axes[plt_row, 0].scatter(X, y, alpha=0.7)
axes[plt_row, 0].plot(X, boosted_fit, c='red', alpha=0.7)
axes[plt_row, 0].set_title(f'Fit after {n_trees} trees', fontsize=20)
axes[plt_row, 1].scatter(X, res, alpha=0.7)
axes[plt_row, 1].plot(X, res, color='r', alpha=0.7)
axes[plt_row, 1].set_title(f'Residuals after {n_trees} trees', fontsize=20)
#plot boosting progress:
fig, axes = plt.subplots(nrows=7, ncols=2, figsize=(20,30))
plot_fits(1, 0, X, boosted_yhat)
plot_fits(5, 1, X, boosted_yhat)
plot_fits(10, 2, X, boosted_yhat)
plot_fits(15, 3, X, boosted_yhat)
plot_fits(20, 4, X, boosted_yhat)
plot_fits(25, 5, X, boosted_yhat)
plot_fits(30, 6, X, boosted_yhat)
#the line appears to be a decent fit for the data, and residuals finally looked quite random
fig.tight_layout()
plt.show()
#plot successive fits together:
plt.figure(figsize=(15, 5))
sumfits = 0
for i in range(len(learners)):
#plot each fit across the data by adding each succesive set of fits and plotting the line
sumfits += boosted_yhat[:, i]
plt.plot(X, sumfits)
#add final fit in red:
plt.plot(X, many_fits(learners, X).sum(1), c='red', alpha=1)
plt.title('Successive Fits', fontsize=20)
plt.xlabel('{0}'.format(column))
plt.ylabel('Fit for {0}'.format(variable));
plt.scatter(X, y, alpha=.4)
plt.show()
First of 30 decision trees of 1-deep:
First of 30 decision trees of 1-deep:
First of 30 decision trees of 1-deep:
First of 30 decision trees of 1-deep:
First of 30 decision trees of 1-deep:
First of 30 decision trees of 1-deep:
First of 30 decision trees of 1-deep:
First of 30 decision trees of 1-deep:
First of 30 decision trees of 1-deep:
First of 30 decision trees of 1-deep:
First of 30 decision trees of 1-deep:
First of 30 decision trees of 1-deep:
First of 30 decision trees of 1-deep:
First of 30 decision trees of 1-deep:
First of 30 decision trees of 1-deep:
First of 30 decision trees of 1-deep:
First of 30 decision trees of 1-deep:
First of 30 decision trees of 1-deep:
First of 30 decision trees of 1-deep:
First of 30 decision trees of 1-deep:
First of 30 decision trees of 1-deep:
First of 30 decision trees of 1-deep:
First of 30 decision trees of 1-deep:
First of 30 decision trees of 1-deep:
First of 30 decision trees of 1-deep:
First of 30 decision trees of 1-deep:
First of 30 decision trees of 1-deep:
First of 30 decision trees of 1-deep:
First of 30 decision trees of 1-deep:
First of 30 decision trees of 1-deep:
First of 30 decision trees of 1-deep:
First of 30 decision trees of 1-deep:
#K-Nearest Neighbor Regression
#import
import os
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
%matplotlib inline
from IPython.core.display import display, HTML
display(HTML("<style>div.output_scroll { height: 31em; }</style>")) #set height of output window below
#plt.rcParams['figure.figsize'] = 18, 8 #width & height for time-series plot
plt.rcParams['figure.figsize'] = 6.000, 6.000
plt.rcParams['lines.linewidth'] = 1
plt.rcParams['lines.markersize'] = 1
df_clean = pd.read_csv('cleaned_data.csv', index_col=0) #read csv file
df_clean["date"] = pd.to_numeric(df_clean["date"], errors="coerce", downcast="integer")
df_knnr = df_clean.copy()
y=df_knnr.pop('FTSE_Close_Price')
df_knnr.insert(0,'FTSE_Close_Price',y)
X = df_knnr.iloc[:,2:]
#Split into random train and test subsets, for cross validation:
X_train, X_test, y_train, y_test = train_test_split(X, y)
mod = KNeighborsRegressor(11, weights='distance') # k=11 neighbors; weights are inverse of distances
mod.fit(X_train, y_train)
y_pred = mod.predict(X_test)
print('\nBelow numbers change for each run:') #because test dataset randomly selected
#unlike OLS, this R² is not corr(y_pred, y_test)**2 :
print('\nAccuracy of fit , R² =', mod.score(X_train, y_train)) #must have called .fit()
print('Accuracy for forecast, R² =', mod.score(X_test, y_test), '\n')
print('Forecast root mean squared error =', mean_squared_error(y_pred, y_test,squared=False))
s, i = np.polyfit(y_pred, y_test, 1) #s=slope, i=intercept; for best-fit 1-degree polynomial
a, b = min(y_pred), max(y_pred)
#line drawn below is theoretically misleading when used for multiple regression,
# as it doesn't represent the fitting process:
#plot 'best-fit' line; increase linewidth for darker line:
plt.plot([a, b], i + [s * a, s * b], color='red', linewidth=0.5)
del s, i, a, b
plt.scatter(y_pred, y_test)
xname = 'Forecasted ' + y.name
plt.xlabel(xname)
plt.ylabel(y.name)
plt.title('Corr(' + xname + ', ' + y.name + ') = ' + str(round(pd.concat([y_test,
pd.Series(y_pred, index=y_test.index)], 1).corr().iloc[0, 1], 4)))
plt.show()
Below numbers change for each run: Accuracy of fit , R² = 1.0 Accuracy for forecast, R² = 0.778683198771275 Forecast root mean squared error = 223.71722960756247
#K-Nearest Neighbor Regression
#import
import os
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
%matplotlib inline
from IPython.core.display import display, HTML
display(HTML("<style>div.output_scroll { height: 31em; }</style>")) #set height of output window below
#plt.rcParams['figure.figsize'] = 18, 8 #width & height for time-series plot
plt.rcParams['figure.figsize'] = 6.000, 6.000
plt.rcParams['lines.linewidth'] = 1
plt.rcParams['lines.markersize'] = 1
df_clean = pd.read_csv('cleaned_data.csv', index_col=0) #read csv file
df_clean["date"] = pd.to_numeric(df_clean["date"], errors="coerce", downcast="integer")
df_knnr = df_clean.copy()
y=df_knnr.pop('SPY_Close_Price')
df_knnr.insert(0,'SPY_Close_Price',y)
X = df_knnr.iloc[:,2:]
#Split into random train and test subsets, for cross validation:
X_train, X_test, y_train, y_test = train_test_split(X, y)
mod = KNeighborsRegressor(11, weights='distance') # k=11 neighbors; weights are inverse of distances
mod.fit(X_train, y_train)
y_pred = mod.predict(X_test)
print('\nBelow numbers change for each run:') #because test dataset randomly selected
#unlike OLS, this R² is not corr(y_pred, y_test)**2 :
print('\nAccuracy of fit , R² =', mod.score(X_train, y_train)) #must have called .fit()
print('Accuracy for forecast, R² =', mod.score(X_test, y_test), '\n')
print('Forecast root mean squared error =', mean_squared_error(y_pred, y_test,squared=False))
s, i = np.polyfit(y_pred, y_test, 1) #s=slope, i=intercept; for best-fit 1-degree polynomial
a, b = min(y_pred), max(y_pred)
#line drawn below is theoretically misleading when used for multiple regression,
# as it doesn't represent the fitting process:
#plot 'best-fit' line; increase linewidth for darker line:
plt.plot([a, b], i + [s * a, s * b], color='red', linewidth=0.5)
del s, i, a, b
plt.scatter(y_pred, y_test)
xname = 'Forecasted ' + y.name
plt.xlabel(xname)
plt.ylabel(y.name)
plt.title('Corr(' + xname + ', ' + y.name + ') = ' + str(round(pd.concat([y_test,
pd.Series(y_pred, index=y_test.index)], 1).corr().iloc[0, 1], 4)))
plt.show()
Below numbers change for each run: Accuracy of fit , R² = 1.0 Accuracy for forecast, R² = 0.8703707132600846 Forecast root mean squared error = 14.862118730954258
## import os
import math
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import metrics
from scipy import stats as st
from scipy.stats import linregress
import scipy.optimize as optimize
import scipy.sparse as sparse
import scipy.special as special
import scipy.stats
from sklearn.model_selection import train_test_split
from sklearn import datasets, linear_model
import warnings
warnings.filterwarnings('ignore') #suppress warning messages (those with peach background)
import statsmodels.api as sm
from statsmodels.formula.api import ols
#load the cleaned dataset into a pandas dataframe (no column is used as the index)
df_clean = pd.read_csv(
    'cleaned_data.csv',
    index_col=None,
)
print('Imported Dataset from cleaned_data.csv:')
df_clean #last expression of the cell, so the notebook renders the dataframe
Imported Dataset from cleaned_data.csv:
| Unnamed: 0 | date | cumCasesByPublishDate | cumVirusTests | cumDailyNsoDeathsByDeathDate | cumPeopleVaccinatedCompleteByPublishDate | cumAdmissions | FTSE_Open_Price | FTSE_Close_Price | FTSE_High_Price | FTSE_Low_Price | FTSE_Volume | Max_Temperature_DegC | Min_Temperature_DegC | Rainfall_mm | Sun_Hours | US_Covid_Cases | US_Covid_Deaths | SPY_Open_Price | SPY_Close_Price | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 2020-01-21 | 0.0 | 0.000000e+00 | 0.0 | 0.000000e+00 | 0.000000 | 7483.57 | 7381.96 | 7483.57 | 7357.62 | 672044352.0 | 14.0 | 6.0 | 76.0 | 135.0 | 1.0 | 0.0 | 330.899994 | 331.299988 |
| 1 | 1 | 2020-01-22 | 0.0 | 5.913656e+03 | 0.0 | 1.102532e+03 | 78.612903 | 7483.57 | 7381.96 | 7483.57 | 7357.62 | 672044352.0 | 14.0 | 6.0 | 76.0 | 135.0 | 1.0 | 0.0 | 332.239990 | 331.339996 |
| 2 | 2 | 2020-01-23 | 0.0 | 1.182731e+04 | 0.0 | 2.205065e+03 | 157.225806 | 7483.57 | 7381.96 | 7483.57 | 7357.62 | 672044352.0 | 14.0 | 6.0 | 76.0 | 135.0 | 1.0 | 0.0 | 330.630005 | 331.720001 |
| 3 | 3 | 2020-01-24 | 0.0 | 1.774097e+04 | 0.0 | 3.307597e+03 | 235.838710 | 7483.57 | 7381.96 | 7483.57 | 7357.62 | 672044352.0 | 14.0 | 6.0 | 76.0 | 135.0 | 2.0 | 0.0 | 332.440002 | 328.769989 |
| 4 | 4 | 2020-01-25 | 0.0 | 2.365462e+04 | 0.0 | 4.410130e+03 | 314.451613 | 7483.57 | 7381.96 | 7483.57 | 7357.62 | 672044352.0 | 14.0 | 6.0 | 76.0 | 135.0 | 3.0 | 0.0 | 332.440002 | 328.769989 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 434 | 434 | 2021-03-30 | 4341736.0 | 1.228242e+08 | 150116.0 | 4.125884e+06 | 457398.000000 | 6736.17 | 6772.12 | 6792.23 | 6729.46 | 654363968.0 | 14.0 | 6.0 | 76.0 | 135.0 | 30416970.0 | 550500.0 | 394.420013 | 394.730011 |
| 435 | 435 | 2021-03-31 | 4345788.0 | 1.241472e+08 | 150116.0 | 4.513458e+06 | 457398.000000 | 6772.12 | 6713.63 | 6775.67 | 6713.63 | 837940608.0 | 12.1 | 2.8 | 37.0 | 116.3 | 30485232.0 | 551638.0 | 395.339996 | 396.329987 |
| 436 | 436 | 2021-04-01 | 4350266.0 | 1.241472e+08 | 150116.0 | 4.958874e+06 | 457398.000000 | 6713.63 | 6737.30 | 6766.52 | 6713.63 | 588526528.0 | 14.0 | 6.0 | 76.0 | 135.0 | 30562856.0 | 552593.0 | 398.399994 | 400.609985 |
| 437 | 437 | 2021-04-02 | 4353668.0 | 1.241472e+08 | 150116.0 | 5.205505e+06 | 457398.000000 | 6713.63 | 6737.30 | 6766.52 | 6713.63 | 588526528.0 | 14.0 | 6.0 | 76.0 | 135.0 | 30631700.0 | 553554.0 | 398.399994 | 400.609985 |
| 438 | 438 | 2021-04-03 | 4357091.0 | 1.241472e+08 | 150116.0 | 5.381745e+06 | 457398.000000 | 6713.63 | 6737.30 | 6766.52 | 6713.63 | 588526528.0 | 14.0 | 6.0 | 76.0 | 135.0 | 30631700.0 | 553554.0 | 398.399994 | 400.609985 |
439 rows × 20 columns
'''
Regression Regularization: balance optimization of fit and simplicity
https://www.pluralsight.com/guides/linear-lasso-ridge-regression-scikit-learn
https://fred.stlouisfed.org #federal reserve bank of st. louis
Linear Regression
Ridge Regression
Lasso Regression
Elastic Net Regression
'''
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
#work on a copy so the cleaned dataframe stays untouched for other cells
df_rl = df_clean.copy()
#set maximum window width
#`display`/`HTML` are imported from the public IPython.display module; the old
# IPython.core.display path for `display` is deprecated
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>")) #change code window width to max
pd.options.display.max_columns = 0 #change output window width to max
#Interactive settings used by the regressions further down:
# a  = regularization strength (alpha/lambda) for Ridge, Lasso and Elastic Net
# mr = Elastic Net mixing parameter (l1_ratio), must lie in [0, 1]
# rs = random_state for the train/test split (integer = reproducible, None = random)
a = input('Positive number for regularization strength (i.e., alpha or lambda) [default: 1.0]? ').strip()
try:
    a = float(a)
except ValueError:
    a = float('nan') #blank or non-numeric: fall through to the default below
if not 0 < a < float('inf'): #also rejects zero, negatives, nan and inf, which the prompt rules out
    a = 1.0
    print(a, 'assumed.')
mr = input('\nElasticNet mixing parameter (0 ≤, ≤ 1) [default: 0.5]? ').strip()
try:
    mr = float(mr)
except ValueError:
    mr = float('nan')
if 0 <= mr <= 1: #nan fails this test, so it can no longer slip through as 'entered'
    print(mr, 'entered.')
else:
    mr = 0.5
    print(mr, 'assumed.')
rs = input('\n1: Reproducible output (input any integer except 2), 2: Random train-test data split [default: 1]? ').strip()
if rs == '2':
    rs = None #None makes the split random; the integer 2 would just be another fixed seed
elif rs == '':
    rs = 1
    print(rs, 'entered.')
else:
    try:
        rs = int(rs)
    except ValueError:
        rs = -1 #invalid marker, replaced by the default below
    if 0 <= rs <= 2**32 - 1: #range of seeds accepted by scikit-learn/numpy
        print(rs, 'entered.')
    else:
        rs = 1 #can be changed to any integer for reproducible randomization
        print(rs, 'assumed.')
print()
#Prepare the modelling data: numeric columns only, target first, predictors scaled by SIQR,
# then a 75/25 train/test split.
print(df_rl)
#print(df.shape)
#print(df.describe())
target = ['FTSE_Close_Price']
#move the target to the first column
y = df_rl.pop(target[0])
df_rl.insert(0, target[0], y)
#keep numeric columns, turn +/-inf into nan, drop rows without a target value and then every
# column that still contains nan; `axis` is passed by keyword because pandas >= 2.0 rejects it positionally
df_rl = (
    df_rl.select_dtypes(include='number')
    .replace([float('inf'), float('-inf')], float('nan'))
    .dropna(axis=0, subset=target)
    .dropna(axis=1)
)
print()
print(df_rl.drop(columns=target).corrwith(df_rl[target[0]]), '\n')
#choose predictors by name, not by position: skipping "the first two columns" silently discards a
# real predictor whenever the saved-index column ('Unnamed: 0') is not present in the dataframe
predictors = [col for col in df_rl.columns
              if col not in target and not str(col).startswith('Unnamed:')]
#scale the x variables by SIQR to make betas (coefficients) 'comparable' in penalty function (could instead use normalize=True but more affected by outliers):
from scipy.stats import iqr, norm
# df[predictors] = df[predictors] / df[predictors].max() #simpler scaling but not ideal
# df[predictors] = df[predictors].apply(lambda x: (x - x.median()) / iqr(x) * (norm.ppf(0.75) * 2)) #no need to center
print('predictors:', df_rl[predictors])
siqr = df_rl[predictors].apply(lambda x: iqr(x) / (norm.ppf(0.75) * 2)).rename('siqr')
zero_iqr = siqr.index[siqr == 0]
if len(zero_iqr) > 0:
    #a zero IQR cannot be used as a divisor: fall back to the sample standard deviation
    siqr.loc[zero_iqr] = df_rl[zero_iqr].std()
    #a constant column has zero standard deviation as well: leave it unscaled instead of dividing by 0
    siqr = siqr.mask(siqr == 0, 1.0)
df_rl[predictors] = df_rl[predictors] / siqr
print('\nXs scaled by SIQR:')
print(df_rl.describe())
X = df_rl[predictors].values
y = df_rl[target].values #2-D (n, 1) array because `target` is a list
#random state for train_test_split
# rs = 1 #any integer; =None for random
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=rs)
print('\nTrain set:', X_train.shape); print('Test set :', X_test.shape)
print('\n Pseudo R² = (correlation(y, (fit or prediction)))**2')
#parentheses fixed so the printed formula squares the residual, not the prediction
print('Coefficient of Determination = 1 - ((y-(fit or prediction))**2).sum()/((y-y.mean())**2).sum() = 1 - SSResidual/SSTotal [can be <0]\n')
#Linear Regression
print('\nOLS Regression:\n')
#https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
from sklearn.linear_model import LinearRegression
lr = LinearRegression(n_jobs=-1) #use all CPU threads
print(lr.fit(X_train, y_train))
pred_train_lr = lr.predict(X_train)
#just 1 peep at train-set fit:
import matplotlib.pyplot as pl
%matplotlib inline
plt.rcParams["figure.figsize"] = (4.5, 4.5)
plt.scatter(pred_train_lr, y_train, s=3)
from sklearn.metrics import r2_score #Coefficient of Determination (can be negative), not (corr(y, y_fit))**2
def r2(y, x, cor=1):
    """Return a goodness-of-fit score between observed values and predictions.

    Parameters
    ----------
    y : array-like
        Observed values (1-D, or 2-D with a single column).
    x : array-like
        Fitted or predicted values, paired with ``y`` by position.
    cor : int, default 1
        ``1`` returns the pseudo R², i.e. the squared Pearson correlation of
        ``y`` and ``x`` (pairs containing nan are ignored by pandas).
        Any other value returns scikit-learn's coefficient of determination,
        which can be negative.

    Returns
    -------
    float
        The requested score; ``0`` for the pseudo R² when ``x`` has fewer than
        two distinct non-nan values (correlation is undefined there).
    """
    if cor == 1:
        #most reliable correlation; works even with nan
        #squeeze only the column axis so a single-row input stays a Series
        x = pd.DataFrame(x).squeeze(axis=1)
        if len(x.dropna().unique()) <= 1:
            return 0
        #axis must be a keyword: pandas >= 2.0 rejects it as a positional argument
        return pd.concat([pd.DataFrame(y), x], axis=1).corr().iloc[0, 1] ** 2
    #coefficient of determination, which can be negative
    return r2_score(y, x)
import numpy as np
from sklearn.metrics import mean_squared_error
#finish the train-set scatter started above
plt.title('y_train vs fit by x_train, R² = ' + str(round(r2(y_train, pred_train_lr), 3)))
plt.show()
# print(pd.DataFrame(lr.coef_[0] / siqr).rename(columns={0: 'OLS'}))
#coefficients divided by the scale factors, i.e. expressed per unit of the unscaled predictors
cfOLS = (
    pd.Series(lr.coef_[0] / siqr, index=predictors, name='OLS')
    .to_frame()
    .rename_axis("'coef'")
)
print(cfOLS)
#train-set diagnostics
print('\nTrain set:')
print('OLS Regression RMSE =', np.sqrt(mean_squared_error(y_train, pred_train_lr)))
print('OLS Regression R² =', r2(y_train, pred_train_lr))
print('OLS Regression Coef of Determination =', r2(y_train, pred_train_lr, 0))
#test-set diagnostics; r2_ols is reused by the comparison at the end of the cell
pred_test_lr = lr.predict(X_test)
print('\nTest set:')
print('OLS Regression RMSE =', np.sqrt(mean_squared_error(y_test, pred_test_lr)))
r2_ols = r2(y_test, pred_test_lr)
print('OLS Regression Pseudo R² =', r2_ols)
print('OLS Regression Coef of Determination =', r2(y_test, pred_test_lr, 0))
#alpha/lambda parameter for ridge, lasso & elastic net:
# a = 1.0 #any positive float
#Ridge Regression: fit on the train split, report coefficients and train/test scores
print('\n\nRidge Regression:\n')
#https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html
from sklearn.linear_model import Ridge
#minimize ||y - Xw||^2_2 + alpha * ||w||^2_2
#the `normalize` keyword is omitted: it was removed from scikit-learn and False was its default anyway
rr = Ridge(alpha=a, max_iter=10000) #increase max_iter to 10000 for comparison with later methods
print(rr.fit(X_train, y_train), '\n')
#np.ravel flattens coef_ whether y was passed as 1-D or as a single-column 2-D array
cfRidge = pd.Series(np.ravel(rr.coef_) / siqr, predictors).rename('Ridge').to_frame()
cfRidge.index.name = "'coef'"
print(cfRidge)
print('\nTrain set:')
pred_train_rr = rr.predict(X_train)
print('Ridge Regression RMSE =', np.sqrt(mean_squared_error(y_train, pred_train_rr)))
print('Ridge Regression Pseudo R² =', r2(y_train, pred_train_rr))
print('Ridge Regression Coef of Determination =', r2(y_train, pred_train_rr, 0))
print('\nTest set:')
pred_test_rr = rr.predict(X_test)
print('Ridge Regression RMSE =', np.sqrt(mean_squared_error(y_test, pred_test_rr)))
r2_ridge = r2(y_test, pred_test_rr) #kept for the comparison at the end of the cell
print('Ridge Regression Pseudo R² =', r2_ridge)
print('Ridge Regression Coef of Determination =', r2(y_test, pred_test_rr, 0))
#Lasso Regression: fit on the train split, report coefficients and train/test scores
print('\n\nLasso Regression:\n')
#https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html
from sklearn.linear_model import Lasso
#minimize (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1
#the `normalize` keyword is omitted: it was removed from scikit-learn and False was its default anyway
model_lasso = Lasso(alpha=a, max_iter=10000) #increase max_iter to 10000 for convergence
print(model_lasso.fit(X_train, y_train), '\n')
#np.ravel keeps the coefficient vector 1-D regardless of how y was shaped
cfLasso = pd.Series(np.ravel(model_lasso.coef_) / siqr, predictors).rename('Lasso').to_frame()
cfLasso.index.name = "'coef'"
print(cfLasso)
print('\nTrain set:')
pred_train_lasso = model_lasso.predict(X_train)
print('Lasso Regression RMSE =', np.sqrt(mean_squared_error(y_train, pred_train_lasso)))
print('Lasso Regression Pseudo R² =', r2(y_train, pred_train_lasso))
print('Lasso Regression Coef of Determination =', r2(y_train, pred_train_lasso, 0))
print('\nTest set:')
pred_test_lasso = model_lasso.predict(X_test)
print('Lasso Regression RMSE =', np.sqrt(mean_squared_error(y_test, pred_test_lasso)))
r2_lasso = r2(y_test, pred_test_lasso) #kept for the comparison at the end of the cell
print('Lasso Regression Pseudo R² =', r2_lasso)
print('Lasso Regression Coef of Determination =', r2(y_test, pred_test_lasso, 0))
#Elastic Net Regression: fit on the train split, report coefficients and train/test scores
# mr = l1-norm/l2-norm ratio, [0, 1], default 0.5
print('\n\nElastic Net Regression:\n')
#https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html
from sklearn.linear_model import ElasticNet
#minimize 1 / (2 * n_samples) * ||y - Xw||^2_2 + alpha * l1_ratio * ||w||_1 + 0.5 * alpha * (1 - l1_ratio) * ||w||^2_2
#https://stackoverflow.com/questions/47365978/scikit-learn-elastic-net-approaching-ridge
#the weights for ridge & lasso sum to alpha and not 1
#l1_ratio=1 yields lasso, but l1_ratio=0 does not yield ridge
#increase max_iter to 10000 (default 1000) for convergence:
# model_enet = ElasticNet(alpha=a, max_iter=1000)
#the `normalize` keyword is omitted (removed from scikit-learn; False was its default);
#random_state=rs seeds the random coordinate selection so a 'reproducible' run really is reproducible
model_enet = ElasticNet(alpha=a, l1_ratio=mr, max_iter=10000, warm_start=True, selection='random',
                        random_state=rs)
print(model_enet.fit(X_train, y_train), '\n')
#np.ravel keeps the coefficient vector 1-D regardless of how y was shaped
cfElNet = pd.Series(np.ravel(model_enet.coef_) / siqr, predictors).rename('El-Net').to_frame()
cfElNet.index.name = "'coef'"
print(cfElNet)
print('\nTrain set:')
pred_train_enet = model_enet.predict(X_train)
print('Elastic Net Regression RMSE =', np.sqrt(mean_squared_error(y_train, pred_train_enet)))
print('Elastic Net Regression Pseudo R² =', r2(y_train, pred_train_enet))
print('Elastic Net Regression Coef of Determination =', r2(y_train, pred_train_enet, 0))
print('\nTest set:')
pred_test_enet = model_enet.predict(X_test)
print('Elastic Net Regression RMSE =', np.sqrt(mean_squared_error(y_test, pred_test_enet)))
r2_enet = r2(y_test, pred_test_enet) #kept for the comparison at the end of the cell
print('Elastic Net Regression Pseudo R² =', r2_enet)
print('Elastic Net Regression Coef of Determination =', r2(y_test, pred_test_enet, 0))
#Side-by-side comparison of the four fitted models: coefficients, test-set pseudo R², scatter plots
#compare coefs
print('\n\nTrain-set coefficients comparison:')
#axis must be a keyword: pandas >= 2.0 rejects it as a positional argument
print(pd.concat([cfOLS, cfRidge, cfLasso, cfElNet], axis=1))
#compare r2s
print('\n\nTest-set Pseudo R² scores comparison:')
print('OLS =', r2_ols)
print('Ridge =', r2_ridge)
print('Lasso =', r2_lasso)
print('Elastic Net =', r2_enet)
#https://stackoverflow.com/questions/42818361/how-to-make-two-plots-side-by-side-using-python
plt.rcParams["figure.figsize"] = (20.2, 4.5)
# pl.tight_layout()
#plot y_test vs y_predict
print('\ny_test vs its prediction by x_test:')
#one panel per model: (title prefix, test-set predictions, test-set pseudo R²)
panels = [
    (' OLS, Pseudo R² = ', pred_test_lr, r2_ols),
    (' Ridge, Pseudo R² = ', pred_test_rr, r2_ridge),
    (' Lasso, Pseudo R² = ', pred_test_lasso, r2_lasso),
    (' Elastic Net, Pseudo R² = ', pred_test_enet, r2_enet),
]
for panel_no, (title_prefix, predictions, score) in enumerate(panels, start=1):
    plt.subplot(1, len(panels), panel_no) #1 row, one column per model
    plt.title(title_prefix + str(round(score, 3)))
    plt.scatter(predictions, y_test, s=3)
plt.show()
Positive number for regularization strength (i.e., alpha or lambda) [default: 1.0]?
1.0 assumed.
ElasticNet mixing parameter (0 ≤, ≤ 1) [default: 0.5]?
0.5 assumed.
1: Reproducible output (input any integer except 2), 2: Random train-test data split [default: 1]?
1 entered.
Unnamed: 0 date ... SPY_Open_Price SPY_Close_Price
0 0 2020-01-21 ... 330.899994 331.299988
1 1 2020-01-22 ... 332.239990 331.339996
2 2 2020-01-23 ... 330.630005 331.720001
3 3 2020-01-24 ... 332.440002 328.769989
4 4 2020-01-25 ... 332.440002 328.769989
.. ... ... ... ... ...
434 434 2021-03-30 ... 394.420013 394.730011
435 435 2021-03-31 ... 395.339996 396.329987
436 436 2021-04-01 ... 398.399994 400.609985
437 437 2021-04-02 ... 398.399994 400.609985
438 438 2021-04-03 ... 398.399994 400.609985
[439 rows x 20 columns]
Unnamed: 0 0.135422
cumCasesByPublishDate 0.375792
cumVirusTests 0.338769
cumDailyNsoDeathsByDeathDate 0.212699
cumPeopleVaccinatedCompleteByPublishDate 0.225391
cumAdmissions 0.266943
FTSE_Open_Price 0.980909
FTSE_High_Price 0.989448
FTSE_Low_Price 0.994536
FTSE_Volume -0.332906
Max_Temperature_DegC -0.093221
Min_Temperature_DegC -0.076366
Rainfall_mm 0.023604
Sun_Hours -0.076971
US_Covid_Cases 0.343327
US_Covid_Deaths 0.263249
SPY_Open_Price 0.577615
SPY_Close_Price 0.577433
dtype: float64
predictors: cumCasesByPublishDate cumVirusTests ... SPY_Open_Price SPY_Close_Price
0 0.0 0.000000e+00 ... 330.899994 331.299988
1 0.0 5.913656e+03 ... 332.239990 331.339996
2 0.0 1.182731e+04 ... 330.630005 331.720001
3 0.0 1.774097e+04 ... 332.440002 328.769989
4 0.0 2.365462e+04 ... 332.440002 328.769989
.. ... ... ... ... ...
434 4341736.0 1.228242e+08 ... 394.420013 394.730011
435 4345788.0 1.241472e+08 ... 395.339996 396.329987
436 4350266.0 1.241472e+08 ... 398.399994 400.609985
437 4353668.0 1.241472e+08 ... 398.399994 400.609985
438 4357091.0 1.241472e+08 ... 398.399994 400.609985
[439 rows x 17 columns]
Xs scaled by SIQR:
FTSE_Close_Price Unnamed: 0 ... SPY_Open_Price SPY_Close_Price
count 439.000000 439.00000 ... 439.000000 439.000000
mean 6287.465991 219.00000 ... 7.761376 7.505738
std 501.205287 126.87264 ... 0.911658 0.883789
min 4993.890000 0.00000 ... 5.293160 5.001330
25% 5935.980000 109.50000 ... 7.189108 6.928621
50% 6224.070000 219.00000 ... 7.789776 7.512427
75% 6594.775000 328.50000 ... 8.538088 8.277600
max 7534.370000 438.00000 ... 9.241399 8.986692
[8 rows x 19 columns]
Train set: (329, 17)
Test set : (110, 17)
Pseudo R² = (correlation(y, (fit or prediction)))**2
Coefficient of Determination = 1 - ((y-(fit or prediction)**2).sum()/((y-y.mean())**2).sum() = 1 - SSResidual/SSTotal [can be <0]
OLS Regression:
LinearRegression(n_jobs=-1)
OLS
'coef'
cumCasesByPublishDate -3.533093e-05
cumVirusTests 1.812631e-06
cumDailyNsoDeathsByDeathDate -3.419427e-03
cumPeopleVaccinatedCompleteByPublishDate -2.191563e-05
cumAdmissions 1.976120e-03
FTSE_Open_Price -3.428070e-01
FTSE_High_Price 5.734527e-01
FTSE_Low_Price 7.816263e-01
FTSE_Volume -1.479742e-08
Max_Temperature_DegC -2.957246e+01
Min_Temperature_DegC 3.320196e+01
Rainfall_mm -8.090846e-01
Sun_Hours 4.152645e-01
US_Covid_Cases -1.293807e-05
US_Covid_Deaths 1.070164e-04
SPY_Open_Price -4.549301e+00
SPY_Close_Price 4.186083e+00
Train set:
OLS Regression RMSE = 39.29569922673521
OLS Regression R² = 0.9940220247126008
OLS Regression Coef of Determination = 0.9940220247125998
Test set:
OLS Regression RMSE = 34.48427851488304
OLS Regression Pseudo R² = 0.9950326305927016
OLS Regression Coef of Determination = 0.9946809372589881
Ridge Regression:
Ridge(max_iter=10000)
Ridge
'coef'
cumCasesByPublishDate 1.171423e-05
cumVirusTests 1.107448e-06
cumDailyNsoDeathsByDeathDate 2.125026e-04
cumPeopleVaccinatedCompleteByPublishDate -8.583817e-06
cumAdmissions 2.021898e-04
FTSE_Open_Price -1.964661e-01
FTSE_High_Price 4.822698e-01
FTSE_Low_Price 6.911287e-01
FTSE_Volume -1.828834e-08
Max_Temperature_DegC -3.069364e+01
Min_Temperature_DegC 3.422443e+01
Rainfall_mm -7.988687e-01
Sun_Hours 4.786664e-01
US_Covid_Cases -4.448333e-06
US_Covid_Deaths -2.771053e-04
SPY_Open_Price -2.033553e+00
SPY_Close_Price 2.255360e+00
Train set:
Ridge Regression RMSE = 41.47970344208789
Ridge Regression Pseudo R² = 0.9933455846324617
Ridge Regression Coef of Determination = 0.9933390624970353
Test set:
Ridge Regression RMSE = 36.87254520795701
Ridge Regression Pseudo R² = 0.9941348687355811
Ridge Regression Coef of Determination = 0.9939186631707337
Lasso Regression:
Lasso(max_iter=10000)
Lasso
'coef'
cumCasesByPublishDate 5.389852e-07
cumVirusTests 0.000000e+00
cumDailyNsoDeathsByDeathDate 0.000000e+00
cumPeopleVaccinatedCompleteByPublishDate 3.231085e-06
cumAdmissions 0.000000e+00
FTSE_Open_Price -1.142847e-01
FTSE_High_Price 3.670547e-01
FTSE_Low_Price 7.295107e-01
FTSE_Volume -9.262284e-09
Max_Temperature_DegC -0.000000e+00
Min_Temperature_DegC 4.922681e-01
Rainfall_mm -3.862259e-02
Sun_Hours 0.000000e+00
US_Covid_Cases -0.000000e+00
US_Covid_Deaths -0.000000e+00
SPY_Open_Price -4.658666e-02
SPY_Close_Price 0.000000e+00
Train set:
Lasso Regression RMSE = 46.08576352635423
Lasso Regression Pseudo R² = 0.9917833443604385
Lasso Regression Coef of Determination = 0.9917776180403057
Test set:
Lasso Regression RMSE = 40.01385534519256
Lasso Regression Pseudo R² = 0.9929934445300771
Lasso Regression Coef of Determination = 0.9928383412942803
Elastic Net Regression:
ElasticNet(max_iter=10000, selection='random', warm_start=True)
El-Net
'coef'
cumCasesByPublishDate 1.295827e-05
cumVirusTests 9.695505e-08
cumDailyNsoDeathsByDeathDate -6.246751e-04
cumPeopleVaccinatedCompleteByPublishDate 1.294890e-05
cumAdmissions -4.750214e-05
FTSE_Open_Price 2.475632e-01
FTSE_High_Price 2.624542e-01
FTSE_Low_Price 2.676444e-01
FTSE_Volume -6.467246e-08
Max_Temperature_DegC -3.465102e+00
Min_Temperature_DegC -0.000000e+00
Rainfall_mm -0.000000e+00
Sun_Hours 3.306819e-02
US_Covid_Cases 1.611214e-07
US_Covid_Deaths -5.649593e-05
SPY_Open_Price 6.747196e-01
SPY_Close_Price 6.715642e-01
Train set:
Elastic Net Regression RMSE = 94.60121959137842
Elastic Net Regression Pseudo R² = 0.9800203051676568
Elastic Net Regression Coef of Determination = 0.9653536455595862
Test set:
Elastic Net Regression RMSE = 92.42239479680823
Elastic Net Regression Pseudo R² = 0.9799322654873668
Elastic Net Regression Coef of Determination = 0.9617926198023452
Train-set coefficients comparison:
OLS ... El-Net
'coef' ...
cumCasesByPublishDate -3.533093e-05 ... 1.295827e-05
cumVirusTests 1.812631e-06 ... 9.695505e-08
cumDailyNsoDeathsByDeathDate -3.419427e-03 ... -6.246751e-04
cumPeopleVaccinatedCompleteByPublishDate -2.191563e-05 ... 1.294890e-05
cumAdmissions 1.976120e-03 ... -4.750214e-05
FTSE_Open_Price -3.428070e-01 ... 2.475632e-01
FTSE_High_Price 5.734527e-01 ... 2.624542e-01
FTSE_Low_Price 7.816263e-01 ... 2.676444e-01
FTSE_Volume -1.479742e-08 ... -6.467246e-08
Max_Temperature_DegC -2.957246e+01 ... -3.465102e+00
Min_Temperature_DegC 3.320196e+01 ... -0.000000e+00
Rainfall_mm -8.090846e-01 ... -0.000000e+00
Sun_Hours 4.152645e-01 ... 3.306819e-02
US_Covid_Cases -1.293807e-05 ... 1.611214e-07
US_Covid_Deaths 1.070164e-04 ... -5.649593e-05
SPY_Open_Price -4.549301e+00 ... 6.747196e-01
SPY_Close_Price 4.186083e+00 ... 6.715642e-01
[17 rows x 4 columns]
Test-set Pseudo R² scores comparison:
OLS = 0.9950326305927016
Ridge = 0.9941348687355811
Lasso = 0.9929934445300771
Elastic Net = 0.9799322654873668
y_test vs its prediction by x_test:
'''
Regression Regularization: balance optimization of fit and simplicity
https://www.pluralsight.com/guides/linear-lasso-ridge-regression-scikit-learn
https://fred.stlouisfed.org #federal reserve bank of st. louis
Linear Regression
Ridge Regression
Lasso Regression
Elastic Net Regression
'''
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
#work on a copy so the cleaned dataframe stays untouched for other cells
df_rl = df_clean.copy()
#set maximum window width
#`display`/`HTML` are imported from the public IPython.display module; the old
# IPython.core.display path for `display` is deprecated
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>")) #change code window width to max
pd.options.display.max_columns = 0 #change output window width to max
#Interactive settings used by the regressions further down:
# a  = regularization strength (alpha/lambda) for Ridge, Lasso and Elastic Net
# mr = Elastic Net mixing parameter (l1_ratio), must lie in [0, 1]
# rs = random_state for the train/test split (integer = reproducible, None = random)
a = input('Positive number for regularization strength (i.e., alpha or lambda) [default: 1.0]? ').strip()
try:
    a = float(a)
except ValueError:
    a = float('nan') #blank or non-numeric: fall through to the default below
if not 0 < a < float('inf'): #also rejects zero, negatives, nan and inf, which the prompt rules out
    a = 1.0
    print(a, 'assumed.')
mr = input('\nElasticNet mixing parameter (0 ≤, ≤ 1) [default: 0.5]? ').strip()
try:
    mr = float(mr)
except ValueError:
    mr = float('nan')
if 0 <= mr <= 1: #nan fails this test, so it can no longer slip through as 'entered'
    print(mr, 'entered.')
else:
    mr = 0.5
    print(mr, 'assumed.')
rs = input('\n1: Reproducible output (input any integer except 2), 2: Random train-test data split [default: 1]? ').strip()
if rs == '2':
    rs = None #None makes the split random; the integer 2 would just be another fixed seed
elif rs == '':
    rs = 1
    print(rs, 'entered.')
else:
    try:
        rs = int(rs)
    except ValueError:
        rs = -1 #invalid marker, replaced by the default below
    if 0 <= rs <= 2**32 - 1: #range of seeds accepted by scikit-learn/numpy
        print(rs, 'entered.')
    else:
        rs = 1 #can be changed to any integer for reproducible randomization
        print(rs, 'assumed.')
print()
#Prepare the modelling data: numeric columns only, target first, predictors scaled by SIQR,
# then a 75/25 train/test split.
print(df_rl)
#print(df.shape)
#print(df.describe())
target = ['SPY_Close_Price']
#move the target to the first column
y = df_rl.pop(target[0])
df_rl.insert(0, target[0], y)
#keep numeric columns, turn +/-inf into nan, drop rows without a target value and then every
# column that still contains nan; `axis` is passed by keyword because pandas >= 2.0 rejects it positionally
df_rl = (
    df_rl.select_dtypes(include='number')
    .replace([float('inf'), float('-inf')], float('nan'))
    .dropna(axis=0, subset=target)
    .dropna(axis=1)
)
print()
print(df_rl.drop(columns=target).corrwith(df_rl[target[0]]), '\n')
#choose predictors by name, not by position: skipping "the first two columns" silently discards a
# real predictor whenever the saved-index column ('Unnamed: 0') is not present in the dataframe
predictors = [col for col in df_rl.columns
              if col not in target and not str(col).startswith('Unnamed:')]
#scale the x variables by SIQR to make betas (coefficients) 'comparable' in penalty function (could instead use normalize=True but more affected by outliers):
from scipy.stats import iqr, norm
# df[predictors] = df[predictors] / df[predictors].max() #simpler scaling but not ideal
# df[predictors] = df[predictors].apply(lambda x: (x - x.median()) / iqr(x) * (norm.ppf(0.75) * 2)) #no need to center
print('predictors:', df_rl[predictors])
siqr = df_rl[predictors].apply(lambda x: iqr(x) / (norm.ppf(0.75) * 2)).rename('siqr')
zero_iqr = siqr.index[siqr == 0]
if len(zero_iqr) > 0:
    #a zero IQR cannot be used as a divisor: fall back to the sample standard deviation
    siqr.loc[zero_iqr] = df_rl[zero_iqr].std()
    #a constant column has zero standard deviation as well: leave it unscaled instead of dividing by 0
    siqr = siqr.mask(siqr == 0, 1.0)
df_rl[predictors] = df_rl[predictors] / siqr
print('\nXs scaled by SIQR:')
print(df_rl.describe())
X = df_rl[predictors].values
y = df_rl[target].values #2-D (n, 1) array because `target` is a list
#random state for train_test_split
# rs = 1 #any integer; =None for random
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=rs)
print('\nTrain set:', X_train.shape); print('Test set :', X_test.shape)
print('\n Pseudo R² = (correlation(y, (fit or prediction)))**2')
#parentheses fixed so the printed formula squares the residual, not the prediction
print('Coefficient of Determination = 1 - ((y-(fit or prediction))**2).sum()/((y-y.mean())**2).sum() = 1 - SSResidual/SSTotal [can be <0]\n')
#Linear Regression
print('\nOLS Regression:\n')
#https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
from sklearn.linear_model import LinearRegression
lr = LinearRegression(n_jobs=-1) #use all CPU threads
print(lr.fit(X_train, y_train))
pred_train_lr = lr.predict(X_train)
#just 1 peep at train-set fit:
import matplotlib.pyplot as pl
%matplotlib inline
plt.rcParams["figure.figsize"] = (4.5, 4.5)
plt.scatter(pred_train_lr, y_train, s=3)
from sklearn.metrics import r2_score #Coefficient of Determination (can be negative), not (corr(y, y_fit))**2
def r2(y, x, cor=1):
    """Return a goodness-of-fit score between observed values and predictions.

    Parameters
    ----------
    y : array-like
        Observed values (1-D, or 2-D with a single column).
    x : array-like
        Fitted or predicted values, paired with ``y`` by position.
    cor : int, default 1
        ``1`` returns the pseudo R², i.e. the squared Pearson correlation of
        ``y`` and ``x`` (pairs containing nan are ignored by pandas).
        Any other value returns scikit-learn's coefficient of determination,
        which can be negative.

    Returns
    -------
    float
        The requested score; ``0`` for the pseudo R² when ``x`` has fewer than
        two distinct non-nan values (correlation is undefined there).
    """
    if cor == 1:
        #most reliable correlation; works even with nan
        #squeeze only the column axis so a single-row input stays a Series
        x = pd.DataFrame(x).squeeze(axis=1)
        if len(x.dropna().unique()) <= 1:
            return 0
        #axis must be a keyword: pandas >= 2.0 rejects it as a positional argument
        return pd.concat([pd.DataFrame(y), x], axis=1).corr().iloc[0, 1] ** 2
    #coefficient of determination, which can be negative
    return r2_score(y, x)
import numpy as np
from sklearn.metrics import mean_squared_error
#finish the train-set scatter started above
plt.title('y_train vs fit by x_train, R² = ' + str(round(r2(y_train, pred_train_lr), 3)))
plt.show()
# print(pd.DataFrame(lr.coef_[0] / siqr).rename(columns={0: 'OLS'}))
#coefficients divided by the scale factors, i.e. expressed per unit of the unscaled predictors
cfOLS = (
    pd.Series(lr.coef_[0] / siqr, index=predictors, name='OLS')
    .to_frame()
    .rename_axis("'coef'")
)
print(cfOLS)
#train-set diagnostics
print('\nTrain set:')
print('OLS Regression RMSE =', np.sqrt(mean_squared_error(y_train, pred_train_lr)))
print('OLS Regression R² =', r2(y_train, pred_train_lr))
print('OLS Regression Coef of Determination =', r2(y_train, pred_train_lr, 0))
#test-set diagnostics; r2_ols is reused by the comparison at the end of the cell
pred_test_lr = lr.predict(X_test)
print('\nTest set:')
print('OLS Regression RMSE =', np.sqrt(mean_squared_error(y_test, pred_test_lr)))
r2_ols = r2(y_test, pred_test_lr)
print('OLS Regression Pseudo R² =', r2_ols)
print('OLS Regression Coef of Determination =', r2(y_test, pred_test_lr, 0))
#alpha/lambda parameter for ridge, lasso & elastic net:
# a = 1.0 #any positive float
#Ridge Regression: fit on the train split, report coefficients and train/test scores
print('\n\nRidge Regression:\n')
#https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html
from sklearn.linear_model import Ridge
#minimize ||y - Xw||^2_2 + alpha * ||w||^2_2
#the `normalize` keyword is omitted: it was removed from scikit-learn and False was its default anyway
rr = Ridge(alpha=a, max_iter=10000) #increase max_iter to 10000 for comparison with later methods
print(rr.fit(X_train, y_train), '\n')
#np.ravel flattens coef_ whether y was passed as 1-D or as a single-column 2-D array
cfRidge = pd.Series(np.ravel(rr.coef_) / siqr, predictors).rename('Ridge').to_frame()
cfRidge.index.name = "'coef'"
print(cfRidge)
print('\nTrain set:')
pred_train_rr = rr.predict(X_train)
print('Ridge Regression RMSE =', np.sqrt(mean_squared_error(y_train, pred_train_rr)))
print('Ridge Regression Pseudo R² =', r2(y_train, pred_train_rr))
print('Ridge Regression Coef of Determination =', r2(y_train, pred_train_rr, 0))
print('\nTest set:')
pred_test_rr = rr.predict(X_test)
print('Ridge Regression RMSE =', np.sqrt(mean_squared_error(y_test, pred_test_rr)))
r2_ridge = r2(y_test, pred_test_rr) #kept for the comparison at the end of the cell
print('Ridge Regression Pseudo R² =', r2_ridge)
print('Ridge Regression Coef of Determination =', r2(y_test, pred_test_rr, 0))
#Lasso Regression: fit on the train split, report coefficients and train/test scores
print('\n\nLasso Regression:\n')
#https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html
from sklearn.linear_model import Lasso
#minimize (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1
#the `normalize` keyword is omitted: it was removed from scikit-learn and False was its default anyway
model_lasso = Lasso(alpha=a, max_iter=10000) #increase max_iter to 10000 for convergence
print(model_lasso.fit(X_train, y_train), '\n')
#np.ravel keeps the coefficient vector 1-D regardless of how y was shaped
cfLasso = pd.Series(np.ravel(model_lasso.coef_) / siqr, predictors).rename('Lasso').to_frame()
cfLasso.index.name = "'coef'"
print(cfLasso)
print('\nTrain set:')
pred_train_lasso = model_lasso.predict(X_train)
print('Lasso Regression RMSE =', np.sqrt(mean_squared_error(y_train, pred_train_lasso)))
print('Lasso Regression Pseudo R² =', r2(y_train, pred_train_lasso))
print('Lasso Regression Coef of Determination =', r2(y_train, pred_train_lasso, 0))
print('\nTest set:')
pred_test_lasso = model_lasso.predict(X_test)
print('Lasso Regression RMSE =', np.sqrt(mean_squared_error(y_test, pred_test_lasso)))
r2_lasso = r2(y_test, pred_test_lasso) #kept for the comparison at the end of the cell
print('Lasso Regression Pseudo R² =', r2_lasso)
print('Lasso Regression Coef of Determination =', r2(y_test, pred_test_lasso, 0))
#Elastic Net Regression: fit on the train split, report coefficients and train/test scores
# mr = l1-norm/l2-norm ratio, [0, 1], default 0.5
print('\n\nElastic Net Regression:\n')
#https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html
from sklearn.linear_model import ElasticNet
#minimize 1 / (2 * n_samples) * ||y - Xw||^2_2 + alpha * l1_ratio * ||w||_1 + 0.5 * alpha * (1 - l1_ratio) * ||w||^2_2
#https://stackoverflow.com/questions/47365978/scikit-learn-elastic-net-approaching-ridge
#the weights for ridge & lasso sum to alpha and not 1
#l1_ratio=1 yields lasso, but l1_ratio=0 does not yield ridge
#increase max_iter to 10000 (default 1000) for convergence:
# model_enet = ElasticNet(alpha=a, max_iter=1000)
#the `normalize` keyword is omitted (removed from scikit-learn; False was its default);
#random_state=rs seeds the random coordinate selection so a 'reproducible' run really is reproducible
model_enet = ElasticNet(alpha=a, l1_ratio=mr, max_iter=10000, warm_start=True, selection='random',
                        random_state=rs)
print(model_enet.fit(X_train, y_train), '\n')
#np.ravel keeps the coefficient vector 1-D regardless of how y was shaped
cfElNet = pd.Series(np.ravel(model_enet.coef_) / siqr, predictors).rename('El-Net').to_frame()
cfElNet.index.name = "'coef'"
print(cfElNet)
print('\nTrain set:')
pred_train_enet = model_enet.predict(X_train)
print('Elastic Net Regression RMSE =', np.sqrt(mean_squared_error(y_train, pred_train_enet)))
print('Elastic Net Regression Pseudo R² =', r2(y_train, pred_train_enet))
print('Elastic Net Regression Coef of Determination =', r2(y_train, pred_train_enet, 0))
print('\nTest set:')
pred_test_enet = model_enet.predict(X_test)
print('Elastic Net Regression RMSE =', np.sqrt(mean_squared_error(y_test, pred_test_enet)))
r2_enet = r2(y_test, pred_test_enet) #kept for the comparison at the end of the cell
print('Elastic Net Regression Pseudo R² =', r2_enet)
print('Elastic Net Regression Coef of Determination =', r2(y_test, pred_test_enet, 0))
#Side-by-side comparison of the four fitted models: coefficients, test-set pseudo R², scatter plots
#compare coefs
print('\n\nTrain-set coefficients comparison:')
#axis must be a keyword: pandas >= 2.0 rejects it as a positional argument
print(pd.concat([cfOLS, cfRidge, cfLasso, cfElNet], axis=1))
#compare r2s
print('\n\nTest-set Pseudo R² scores comparison:')
print('OLS =', r2_ols)
print('Ridge =', r2_ridge)
print('Lasso =', r2_lasso)
print('Elastic Net =', r2_enet)
#https://stackoverflow.com/questions/42818361/how-to-make-two-plots-side-by-side-using-python
plt.rcParams["figure.figsize"] = (20.2, 4.5)
# pl.tight_layout()
#plot y_test vs y_predict
print('\ny_test vs its prediction by x_test:')
#one panel per model: (title prefix, test-set predictions, test-set pseudo R²)
panels = [
    (' OLS, Pseudo R² = ', pred_test_lr, r2_ols),
    (' Ridge, Pseudo R² = ', pred_test_rr, r2_ridge),
    (' Lasso, Pseudo R² = ', pred_test_lasso, r2_lasso),
    (' Elastic Net, Pseudo R² = ', pred_test_enet, r2_enet),
]
for panel_no, (title_prefix, predictions, score) in enumerate(panels, start=1):
    plt.subplot(1, len(panels), panel_no) #1 row, one column per model
    plt.title(title_prefix + str(round(score, 3)))
    plt.scatter(predictions, y_test, s=3)
plt.show()
Positive number for regularization strength (i.e., alpha or lambda) [default: 1.0]?
1.0 assumed.
ElasticNet mixing parameter (0 ≤, ≤ 1) [default: 0.5]?
0.5 assumed.
1: Reproducible output (input any integer except 2), 2: Random train-test data split [default: 1]?
1 entered.
date cumCasesByPublishDate ... SPY_Open_Price SPY_Close_Price
0 NaN 0.0 ... 330.899994 331.299988
1 NaN 0.0 ... 332.239990 331.339996
2 NaN 0.0 ... 330.630005 331.720001
3 NaN 0.0 ... 332.440002 328.769989
4 NaN 0.0 ... 332.440002 328.769989
.. ... ... ... ... ...
434 NaN 4341736.0 ... 394.420013 394.730011
435 NaN 4345788.0 ... 395.339996 396.329987
436 NaN 4350266.0 ... 398.399994 400.609985
437 NaN 4353668.0 ... 398.399994 400.609985
438 NaN 4357091.0 ... 398.399994 400.609985
[439 rows x 19 columns]
cumCasesByPublishDate 0.811592
cumVirusTests 0.834378
cumDailyNsoDeathsByDeathDate 0.824761
cumPeopleVaccinatedCompleteByPublishDate 0.517156
cumAdmissions 0.835946
FTSE_Open_Price 0.547642
FTSE_Close_Price 0.577433
FTSE_High_Price 0.533058
FTSE_Low_Price 0.594882
FTSE_Volume -0.473475
Max_Temperature_DegC -0.050928
Min_Temperature_DegC -0.036445
Rainfall_mm 0.020962
Sun_Hours -0.087809
US_Covid_Cases 0.858229
US_Covid_Deaths 0.871273
SPY_Open_Price 0.996157
dtype: float64
predictors: cumVirusTests ... SPY_Open_Price
0 0.000000e+00 ... 330.899994
1 5.913656e+03 ... 332.239990
2 1.182731e+04 ... 330.630005
3 1.774097e+04 ... 332.440002
4 2.365462e+04 ... 332.440002
.. ... ... ...
434 1.228242e+08 ... 394.420013
435 1.241472e+08 ... 395.339996
436 1.241472e+08 ... 398.399994
437 1.241472e+08 ... 398.399994
438 1.241472e+08 ... 398.399994
[439 rows x 16 columns]
Xs scaled by SIQR:
SPY_Close_Price cumCasesByPublishDate ... US_Covid_Deaths SPY_Open_Price
count 439.000000 4.390000e+02 ... 439.000000 439.000000
mean 334.591846 1.215935e+06 ... 1.244394 7.761376
std 39.397654 1.494139e+06 ... 1.008129 0.911658
min 222.949997 0.000000e+00 ... 0.000000 5.293160
25% 308.865005 2.172215e+05 ... 0.479213 7.189108
50% 334.890015 3.303680e+05 ... 1.092153 7.789776
75% 369.000000 1.878891e+06 ... 1.828193 8.538088
max 400.609985 4.357091e+06 ... 3.345151 9.241399
[8 rows x 18 columns]
Train set: (329, 16)
Test set : (110, 16)
Pseudo R² = (correlation(y, (fit or prediction)))**2
Coefficient of Determination = 1 - ((y-(fit or prediction)**2).sum()/((y-y.mean())**2).sum() = 1 - SSResidual/SSTotal [can be <0]
OLS Regression:
LinearRegression(n_jobs=-1)
OLS
'coef'
cumVirusTests -2.908311e-08
cumDailyNsoDeathsByDeathDate 1.328169e-04
cumPeopleVaccinatedCompleteByPublishDate 7.853936e-07
cumAdmissions -7.793278e-05
FTSE_Open_Price 5.229964e-03
FTSE_Close_Price 2.732435e-02
FTSE_High_Price -1.090251e-02
FTSE_Low_Price -2.114003e-02
FTSE_Volume -1.433910e-09
Max_Temperature_DegC 1.411623e+00
Min_Temperature_DegC -1.481270e+00
Rainfall_mm 7.359586e-02
Sun_Hours 4.796993e-03
US_Covid_Cases 6.842855e-07
US_Covid_Deaths -6.834192e-07
SPY_Open_Price 9.675029e-01
Train set:
OLS Regression RMSE = 3.146511430655732
OLS Regression R² = 0.9938571920908992
OLS Regression Coef of Determination = 0.9938571920908997
Test set:
OLS Regression RMSE = 3.2096495971231693
OLS Regression Pseudo R² = 0.9927141937874558
OLS Regression Coef of Determination = 0.9924140560322812
Ridge Regression:
Ridge(max_iter=10000)
Ridge
'coef'
cumVirusTests -2.711860e-08
cumDailyNsoDeathsByDeathDate -4.508882e-06
cumPeopleVaccinatedCompleteByPublishDate 1.785801e-07
cumAdmissions -3.282521e-05
FTSE_Open_Price -7.614839e-04
FTSE_Close_Price 1.532234e-02
FTSE_High_Price -4.463588e-03
FTSE_Low_Price -7.480204e-03
FTSE_Volume -2.145386e-09
Max_Temperature_DegC 1.171894e+00
Min_Temperature_DegC -1.105094e+00
Rainfall_mm 6.934125e-02
Sun_Hours 8.979359e-03
US_Covid_Cases 1.455523e-07
US_Covid_Deaths 4.276766e-05
SPY_Open_Price 8.996507e-01
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-55-dc5a94b17474> in <module> 179 cfRidge.index.name = "'coef'" 180 print(cfRidge) --> 181 print("SUMMARY", cfRidge.summary2()) 182 print('\nTrain set:') 183 pred_train_rr = rr.predict(X_train) ~/opt/anaconda3/lib/python3.7/site-packages/pandas/core/generic.py in __getattr__(self, name) 5458 if self._info_axis._can_hold_identifiers_and_holds_name(name): 5459 return self[name] -> 5460 return object.__getattribute__(self, name) 5461 5462 def __setattr__(self, name: str, value) -> None: AttributeError: 'DataFrame' object has no attribute 'summary2'
# Variable Selection Function
def variableSelection(df, yvar, max_params=6):
    """Backward-eliminate OLS regressors by p-value until the model is small enough.

    An OLS model of ``yvar`` on every other column of ``df`` (plus a constant) is
    fitted; while it has more than ``max_params`` parameters, the regressor with
    the largest p-value is dropped and the model is refitted.

    Parameters
    ----------
    df : pandas.DataFrame
        Contains the target column and all candidate regressors. Not modified.
    yvar : str
        Name of the target column.
    max_params : int, default 6
        Number of parameters (the constant included) at which elimination stops;
        the default keeps the model explainable for business users.

    Returns
    -------
    tuple
        ``(final_model, initial_model)`` -- the fitted statsmodels results of the
        reduced model and of the first, full model, for comparison.
    """
    data = df.copy()
    y = data[yvar]
    initial_model = None
    while True:
        # Build model on every remaining column except the target
        predictors = [col for col in data.columns if col != yvar]
        mod = sm.OLS(y, sm.add_constant(data[predictors])).fit()
        # Save the initial model
        if initial_model is None:
            initial_model = mod
        # '<=' (not '==') so a model that starts out small enough is returned as is
        if len(mod.params) <= max_params:
            break
        # p-values of the regressors only; NaN p-values cannot be ranked
        pvals = mod.pvalues.drop(labels="const", errors="ignore").dropna()
        if pvals.empty:
            break
        # Drop the param with the largest p-value (first one wins a tie)
        data = data.drop(columns=pvals.idxmax())
    # Return both final and initial models --> For comparison
    return mod, initial_model
#load the cleaned dataset and keep only the date, the regressors chosen by backward
#elimination and the target 'cumCasesByPublishDate' (target placed last)
df = pd.read_csv('cleaned_data.csv')
#discard index artefacts written by an earlier to_csv() ('Unnamed: 0', ...); otherwise the
#row counter is offered to the variable selection as if it were a regressor
df = df.loc[:, ~df.columns.str.startswith('Unnamed')]
df["date"] = pd.to_datetime(df["date"])
initvar = [i for i in list(df.columns) if i != "date"]
cdf = df[initvar]
a,b = variableSelection(cdf,"cumCasesByPublishDate") #a: reduced model, b: full model
a.summary2()
#names of the surviving regressors, without the constant
var_list = ["date"] + [name for name in a.pvalues.index if name != "const"]
var_list.append("cumCasesByPublishDate")
df = df[var_list]
## Time Series ARIMA, ARIMAX or auto SARIMAX Forecast
#only have to specify filename, yname & timestamp-name; all other columns will be taken as Xs
#Xs should first go through regression variable selection
#categorical variables will be deleted
#missing values are automatically handled
#processed input data may be written to disk, just before analysis, using the parameter writedata
#warning messages only appear for the first run
#SARIMAX Seasonal AutoRegressive Integrated Moving Average with eXogenous regressors
#https://www.statsmodels.org/dev/examples/notebooks/generated/statespace_sarimax_stata.html
import time
stm = time.time() #start time, reported at the end of the analysis
import os
#move to the data folder: Windows location first, then Mac; only a missing environment
#variable (KeyError) or a missing/inaccessible folder (OSError) triggers the next fallback
try:
    os.chdir(os.environ['USERPROFILE'] + r'\Documents' + r"\0_Teach\data") #change "\0_Teach\data" accordingly
except (KeyError, OSError):
    try:
        os.chdir('/Users/' + os.environ['USER'] + '/Documents' + r"/0_Teach/data") #for Mac
    except (KeyError, OSError):
        #assume data file in current folder
        pass
import pandas as pd
#template for reading another dataset (kept for reference):
#df = pd.read_excel(r"COE_TimeSeries_lag5.xlsx", index_col=0) #ARIMAX
#in the target (y) column, leave tail periods to be forecasted blank
#older python does not read .xlsx:
# #df = pd.read_excel(r"COE_TimeSeries_lag5.xlsx").iloc[:, :2] #ARIMA, only 2 columns needed
# df = pd.read_excel(r"COE_TimeSeries_lag5.xlsx") #ARIMAX
# tn = 'Date' #time-stamp column, for index (row labels)
# #the index (row labels) must not have any NaN
# yn = 'CatECOE'#time series column
# pc = 24 #periodicity; randomly assumed here; e.g. weekly data would be 7, monthly 12, quarterly 4
# #input y2 for testing set if available; else, comment out next 3 lines:
# y2 = [44000, 39903, 38801, 39000, 38000] #n2 actual future values kept for comparing with forecasts
# df = pd.read_csv('SARIMAX_grid-search_prepared_data.csv')
# df.drop(['FTSE100 Close', 'Bitcoin Price', 'Crude Oil Price'], axis=1, inplace=True)
tn = 'date' #time serial column name
yn = "cumCasesByPublishDate" #target (y) column name
pc = 7 #periodicity (seasonal period) handed to auto_arima as m
# y2 = [1643, 1875, 1938, 1986, 1944, 1982, 1592] #n2 actual future values kept for comparing with forecasts
#use the time-stamp column as the row index; the resulting index carries freq=None.
#'infer_datetime_format' is no longer passed: it is deprecated since pandas 2.0 and does not
#change the parsed values
df.index = pd.to_datetime(df[tn])
#df.index = pd.DatetimeIndex(df[tn], freq=pd.infer_freq(df[tn]))
# f = 'SMS-16' #frequency specified; might be changed by inference
# if df.index.freq == None:
#     #reset freq to inferred if possible
#     #warning: this frequently fails!
#     z = pd.infer_freq(df.index)
#     #print('Inferred freq =', z, '\n')
#     if z != None:
#         if f != 'SMS-16' or z != '15D':
#             print('Setting freq to inferred "' + z + '".\n')
#             f = z
#     elif f == '':
#         f = z
# else:
#     f = df.index.freq
# df.index.freq = f
print('Dataset has', len(df), 'rows and the following columns:')
print(list(df), '\n')
import numpy as np
#Prepare the series: drop rows before the first observed y, interpolate gaps inside y,
#keep only numeric X columns and fill their gaps (including both ends); df ends up as
#[y, X1, X2, ...] with y in the first column.
# #artificially set some NaNs at the start and in the middle for y:
# df.iloc[:4, list(df).index(yn)] = np.nan
# df.iloc[10:14, list(df).index(yn)] = np.nan
# print(df.iloc[:15, list(df).index(yn)])
y = pd.DataFrame(df[yn])
#1-based position of the first non-missing y at the start
s = list(y.index).index(y[yn].notna().idxmax()) + 1
print(df.head(max(s, 3))) #show all of any missing at the start
y = y.interpolate(limit_area='inside') #fill in missing in the middle; both ends are left as they are
#rows to show at the end: based on the y values still missing after the inside interpolation
e = max(int(len(y) - y[yn].count()) - s + 2, 3)
print(df.tail(e)) #shows all of any missing at the end
#if not time series, to remove any row with NaN, inf or -inf: df[~df.isin([np.nan, np.inf, -np.inf]).any(1)]
#delete any row with missing y at the start, only use numeric columns, and fill in missing Xs
#everywhere (middle, beginning & end); y itself is never extrapolated
df = pd.concat([y.iloc[s - 1:, :],
                df.select_dtypes(include='number').iloc[s - 1:, :].drop(columns=yn)
                .interpolate(limit_direction='both')], axis=1)
#also df.bfill() & df.ffill()
writedata = False #write processed data to disk
if writedata:
    kname = 'auto_SARIMAX_prepared_data.csv'
    if os.path.exists(kname):
        os.remove(kname)
    #df already holds the processed y & Xs (the former pd.concat([y, x], 1) referred to an
    #x that is not defined at this point)
    df.to_csv(kname)
    print('\n' + kname, 'written to disk.')
    del kname
del os, writedata
if 'y2' in vars():
    #label the held-back actual values with the periods that follow the last observed y
    #using df.index was inspired by Ong Jun Hong:
    y2 = pd.Series(y2, index=pd.date_range(df.index[-(len(df[yn]) - df[yn].count() + 1)], periods=len(y2)+1,
                                           freq=df.index.freq)[1:]).rename(yn + ' (unused)')
#done preparing time series
#split df (y in column 0, Xs after it) into the fitting window and the forecast window
n0 = int(df[yn].count()) #number of non-NaN values; type(df[yn].count())=<class 'numpy.int32'>
### CHanged
#periods to forecast ahead, fixed at 14 here; this replaces the template rule
#"number of blank y rows at the tail (len(df) - n0), or pc when there are none"
n2 = 14
n = min(3653, n0) #use only the last 3653 observations
if n0 != n:
    print('Only the last', n, 'of', n0, 'observations will be used for fitting.\n')
n1 = min(len(df[yn]), n + n2) #rows taken from the end of df: fitting window + forecast window
y = df[yn].iloc[-n1:-n2] #target over the fitting window
x = df.iloc[-n1:-n2, 1:] #for fitting in-sample values
x2 = df.iloc[-n2:, 1:] #for forecasting future values
xc = x.shape[1] #number of X variables
#fit ARIMA to time series
#(p,d,q) = (1,1,1) is only an example order; the Xs, if any, enter as exogenous regressors
from IPython.display import display, HTML #IPython.core.display is the deprecated location
display(HTML("<style>div.output_scroll { height: 31em; }</style>"))
p, d, q = 1, 1, 1
print('\033[1m' + '\nFitting ARIMA' + ('X' if xc > 0 else '') + f'({p},{d},{q}) for the last', len(y),
      "obs of '" + yn + (f"' using {xc} X-variables: {list(x)}" if xc > 0 else "'")
      + '...\n' + '\033[0m') #in bold
#NOTE(review): this is the legacy statsmodels ARIMA; it is believed to have been removed in
#statsmodels 0.13 -- confirm. predict(typ='levels') and the 3-tuple returned by forecast()
#further down depend on this legacy class, so switching to statsmodels.tsa.arima.model.ARIMA
#needs those calls changed as well.
from statsmodels.tsa.arima_model import ARIMA
# from statsmodels.tsa.arima.model import ARIMA
#https://www.statsmodels.org/stable/generated/statsmodels.tsa.arima.model.ARIMA.html
#https://statsmodels.org/stable/generated/statsmodels.tsa.arima_model.ARIMA.fit.html
#out = ARIMA(y, order=(1,1,1), missing='raise').fit(trend='c', transparams=True)
out = ARIMA(y, (p,d,q), x).fit() if xc > 0 else ARIMA(y, (p,d,q)).fit()
# out = ARIMA(y, x, (p,d,q)).fit() if xc > 0 else ARIMA(y, (p,d,q)).fit()
#print(dir(out))
print(out.summary(), '\n')
#check reliability of model:
tt = out.tvalues #z-values
if np.isfinite(tt).all(): #a NaN or infinite z-value marks the model as unreliable
    tt = out.summary().as_text()
    #the 4 characters that end 7 characters before 'Kurtosis' are taken to be
    #Prob(H) (two-sided); the test is skipped when the summary has no 'Kurtosis' entry
    iss = 'Kurtosis' in tt and tt[tt.find('Kurtosis') - 7:][:4] in (' nan', '1.00', '0.00')
else:
    iss = True
if iss:
    print('This model is not reliable!\n')
del tt, iss
#https://statsmodels.org/stable/generated/statsmodels.tsa.arima_model.ARIMAResults.predict.html
#ft = out.fittedvalues #fitted values for d-integrated(differenced) series, not original series
ft = out.predict(typ='levels') #without 'levels', fitted output will be for d-differenced series
yh = pd.concat((y, ft.rename('Fitted')), 1) #y & yhat
print(yh)
#plot time series vs fit and its 'line'
#plot 'regression' line
mm = np.array([min(ft), max(ft)])
y0 = y[len(y)-len(ft):] #y & ft can have different lengths
import matplotlib.pyplot as pl
%matplotlib inline
pl.rcParams['figure.figsize'] = 7, 7 #square plot
# pl.rcParams['lines.linewidth'] = 1.0
# pl.plot(mm, sum(np.polyfit(ft, y0, 1) * [mm, 1]), 'red') #increase linewidth for darker line
pl.plot(mm, sum(np.polyfit(ft, y0, 1) * [mm, 1]), 'r', lw=1) #increase linewidth for darker line
#scatter time series vs fit
# pl.rcParams['lines.markersize'] = 2.0
# pl.scatter(ft, y0)
pl.scatter(ft, y0, s=3)
pl.ylabel(yn)
pl.xlabel('Fitted ' + yn)
pl.title('R² = ' + str('%.4f' % yh.corr().iloc[1, 0]**2) + ' for ' + str(len(y0)) +
' obs by ARIMA' + ('X(' if xc > 0 else '(') + str(p) + ',' + str(d) + ','
+ str(q) + ')' + (' with ' + str(xc) + ' X-variables' if xc > 0 else ''))
pl.show()
#print('forecast error:', out._forecast_error) #doesn't work
# out.sigma2 is a bit different from sum((y-ft).dropna()**2)/len(ft)
print('StdErr(Fitted) =', np.sqrt(out.sigma2), '(same as for 1st forecast value below)' '\n\n')
# print('StdErr(Fitted) =', np.sqrt(out.mse), '(same as for 1st forecast value below)' '\n\n')
#forecast future values
#future 2.5% or 5 periods of time series:
#stnd = pd.date_range(y.index[-1], periods=max(1, min(n2, len(y)//40)) + 1, freq=y.index.freq)
#last fitted period followed by the n2 forecast periods
stnd = pd.date_range(y.index[-1], periods=n2+1, freq=y.index.freq)
print('\033[1m' + 'ARIMA forecast for', n2, "future values of '" + yn
      + ("' using " + str(xc) + ' X-variables' if xc > 0 else "'") + '...\n' + '\033[0m')
#https://statsmodels.org/stable/generated/statsmodels.tsa.arima_model.ARIMAResults.forecast.html
fc, stderr, confint = out.forecast(n2, x2) if xc > 0 else out.forecast(n2)
# print(pd.concat((pd.Series(fc).rename('Forecast'), pd.Series(stderr).rename('Std Error')), axis=1
#                ).set_index(stnd[1:])) #also works
# print(pd.concat((pd.Series(fc, name='Forecast'), pd.Series(stderr, name='Std Error')), axis=1
#                ).set_index(stnd[1:])) #also works
print(pd.concat((pd.Series(fc), pd.Series(stderr)), axis=1, keys=['Forecast', 'Std Error']
               ).set_index(stnd[1:]))
fc = pd.Series(fc, index=pd.date_range(ft.index[-1], periods=len(fc)+1,
                                       freq=ft.index.freq)[1:]).rename('Forecast')
#plot last 200+ training periods' observed & fitted, & forecast
nb = -(-200 // pc * pc) if pc > 1 else 200 #last training periods to plot: 200 rounded up to a multiple of pc
pl.rcParams['figure.figsize'] = 18, 8 #use the whole width of window
#https://matplotlib.org/api/_as_gen/matplotlib.pyplot.plot.html
pl.title('Last ' + str(nb) + ' observed & fitted values, & ' + str(n2) + " forecasts of '"
         + yn + "' by ARIMA" + ('X(' if xc > 0 else '(') + str(p) + ',' + str(d) + ','
         + str(q) + ')' + (' with ' + str(xc) + ' X-variables' if xc > 0 else ''))
pl.plot(ft[-nb:], label='Fit')
#the last fitted point is prepended so the forecast line joins the fit line;
#pd.concat replaces Series.append, which pandas 2.0 removed
pl.plot(pd.concat([ft[-1:], fc]), c='cyan', label=fc.name)
pl.plot(y[-nb:], label=yn)
#https://matplotlib.org/api/_as_gen/matplotlib.pyplot.legend.html
# #place legend precisely for 18x8 plot & 2 series; might cover title
# pl.legend(bbox_to_anchor=(1.005, 1.087))
pl.legend(bbox_to_anchor=(1.005, 1.165))
#pl.legend(loc='best') #avoid plotted space within the frame, but cpu-intensive
#pl.legend(loc='upper left')
pl.xticks(rotation=90)
pl.xlabel(y.index.name)
pl.ylabel(yn)
pl.show()
#plot last few training periods' observed & fitted, & forecast
#last 10% or 24 periods of time series:
lt = max(2, min(max(2 * pc, 24), (len(y) - 1) // 10))
start = y.index[-lt]
#future 2.5% or 5 periods of time series:
end = stnd[-1]
# #https://statsmodels.org/stable/generated/statsmodels.tsa.arima_model.ARMAResults.plot_predict.html
# out.plot_predict(start, end, x2) if xc > 0 else out.plot_predict(start, end)
# pl.plot(pd.concat([ft[-1:], fc]), c='cyan')
# pl.legend(bbox_to_anchor=(1.005, 1.180)) #place legend precisely for 18x8 plot to avoid title
# #pl.legend(loc='upper left') #relocate the legend
# #pl.xticks(rotation=90, ha='center', rotation_mode='anchor') #center labels at tick marks
# pl.xticks(rotation=90, ha='center') #center labels at tick marks
# pl.xlabel(y.index.name)
# pl.ylabel(yn)
# pl.title('Last ' + str(lt) + ' observed & fitted values, & ' + str(n2) + " forecasts of '"
#          + yn + "' by ARIMA" + ('X(' if xc > 0 else '(') + str(p) + ',' + str(d) + ','
#          + str(q) + ')' + (' with ' + str(xc) + ' X-variables' if xc > 0 else ''))
# pl.show()
#Fit SARIMAX Seasonal AutoRegressive Integrated Moving Average with X variables
import warnings
#suppress warning messages (those with peach background)
warnings.simplefilter('ignore')
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', 'statsmodels.tsa.arima_model.ARMA',
                        FutureWarning)
warnings.filterwarnings('ignore', 'statsmodels.tsa.arima_model.ARIMA',
                        FutureWarning)
#do below only once, to install pmdarima
#alkaline-ml no longer available:
#!conda install -c alkaline-ml pmdarima --y
#for python before 3.8:
#!conda install -c Saravji pmdarima --y
#as last resort:
#pip install pmdarima
#https://www.statsmodels.org/stable/generated/statsmodels.tsa.statespace.sarimax.SARIMAX.html
#https://www.statsmodels.org/devel/examples/notebooks/generated/statespace_sarimax_pymc3.html
#https://stackoverflow.com/questions/22770352/auto-arima-equivalent-for-python
#https://iprokin.github.io/posts/2017-01-08-idARIMA.html
try:
    #https://alkaline-ml.com/pmdarima/modules/generated/pmdarima.arima.auto_arima.html
    from pmdarima.arima import auto_arima #wrapper around statsmodels arima
except ImportError:
    #register stand-ins for sklearn.externals.six / sklearn.externals.joblib, then retry;
    #presumably for a pmdarima build that still imports them from sklearn -- verify
    import six
    import sys
    sys.modules['sklearn.externals.six'] = six
    import joblib
    sys.modules['sklearn.externals.joblib'] = joblib
    from pmdarima.arima import auto_arima #wrapper around statsmodels arima
print('\033[1m' + '\nFitting Auto SARIMA(X) for the last', len(y), "obs of '" + yn +
      ("' using " + str(xc) + ' X-variables: ' + str(list(x)) if xc > 0 else "'")
      + '...' + '\033[0m', '(this may take time)\n') #in bold
#o = auto_arima(y) #auto ARIMA; not SARIMA
#search settings shared by the with-X and without-X fits, kept in one place so that the two
#paths cannot drift apart
sarimax_search = dict(
    start_p=0, start_q=0, max_p=pc, max_d=3, max_q=pc, #for y
    start_P=0, start_Q=0, max_P=pc, max_D=3, max_Q=pc, #for seasonal
    max_order=None, #p+q+P+Q; None means no constraints on maximum order
    #1 = no seasonality; 7 for daily, 4 for quarterly, 12 for monthly:
    # m=1 if pc == 24 else pc,
    m=pc, #'m != 1' makes this code run for longer
    suppress_warnings=True, error_action='ignore',
    n_jobs=-1) #n_jobs=-1 tries to use all CPU threads
if xc > 0:
    #auto SARIMAX
    o = auto_arima(y, x, **sarimax_search)
    fit = o.predict_in_sample(x)
else:
    #auto SARIMA
    o = auto_arima(y, **sarimax_search)
    fit = o.predict_in_sample()
del sarimax_search
#predict_in_sample may return a numpy array or a Series depending on the pmdarima version;
#either way the values are labelled positionally with the matching tail of y's own index,
#so they line up with y in the table and plots that follow
fit = pd.Series(np.asarray(fit), index=y.index[-len(fit):])
# print(dir(o), '\n') #lists what the fitted model offers, among others: aic, aicc, bic, hqic,
#   arparams, maparams, bse, conf_int, order, seasonal_order, params, pvalues, resid, fit,
#   predict, predict_in_sample, plot_diagnostics, summary, to_dict, update
# print(o.scoring, '\n')
print(o.summary(), '\n')
#http://medium.com/@josemarcialportilla/using-python-and-auto-arima-to-forecast-seasonal-time-series-90877adff03c
#check reliability of model:
try:
    tt = o.params()/o.bse() #z-values
    if np.isfinite(tt).all(): #a NaN or infinite z-value marks the model as unreliable
        tt = o.summary().as_text()
        #the 4 characters that end 7 characters before 'Kurtosis' are taken to be
        #Prob(H) (two-sided); the test is skipped when the summary has no 'Kurtosis' entry
        iss = 'Kurtosis' in tt and tt[tt.find('Kurtosis') - 7:][:4] in (' nan', '1.00', '0.00')
    else:
        iss = True
    del tt
except Exception:
    #z-values or summary could not be produced: treat the model as unreliable
    iss = True
if iss:
    print('This model is not reliable!\n')
del iss
yhs = pd.concat((y, fit.rename('Fitted')), axis=1) #y & yhat #this works with updated pmdarima
# yhs = pd.concat((y, pd.Series(fit, index=y.index).rename('Fitted')), axis=1) #y & yhat
print(yhs) #y-hat for SARIMA/X
#plot time series vs fit and its 'line'; the first fitted value is left out of the line,
#the scatter and the R²
#plot 'regression' line
mm = np.array([min(fit.iloc[1:]), max(fit.iloc[1:])]) #x-range of the line
y0 = y.iloc[len(y)-len(fit):] #y & fit can have different lengths
pl.rcParams['figure.figsize'] = 7, 7 #square plot
# pl.rcParams['lines.linewidth'] = 1.0
#np.polyval evaluates slope*mm + intercept; the former "coefs * [mm, 1]" built a ragged
#array, which newer NumPy refuses
pl.plot(mm, np.polyval(np.polyfit(fit.iloc[1:], y0.iloc[1:], 1), mm), 'r', lw=1) #increase linewidth for darker line
#scatter time series vs fit
# pl.rcParams['lines.markersize'] = 2.0
# pl.scatter(ft, y0)
pl.scatter(fit.iloc[1:], y0.iloc[1:], s=3)
pl.ylabel(yn)
pl.xlabel('Fitted ' + yn)
pl.title('R² = ' + f'{yhs.iloc[1:, :].corr().iloc[1, 0]**2:.4f}' + ' for ' + str(len(y0) - 1) +
         ' obs by SARIMA' + ('X' if xc > 0 else '') + str(o.order) + str(o.seasonal_order)
         + (' with ' + str(xc) + ' X-variables' if xc > 0 else ''))
pl.show()
#forecast n2 periods ahead with the auto-selected SARIMA(X) model, with confidence limits
print('\033[1m' + 'SARIMA forecast for', n2, "future values of '" + yn
      + ("' using " + str(xc) + ' X-variables' if xc > 0 else "'") + '...\n' + '\033[0m')
# yf = o.predict(n2, x2) if xc > 0 else o.predict(n2) #forecast
yf, ci = o.predict(n2, x2, return_conf_int=True) if xc > 0 else o.predict(n2, return_conf_int=True) #forecast
#np.asarray: the forecast may come back as a Series with its own index, depending on the
#pmdarima version; the values are relabelled positionally with the periods that follow
#the last fitted one
yf = np.asarray(yf)
ci = np.asarray(ci)
fidx = pd.date_range(y.index[-1], periods=len(yf)+1, freq=y.index.freq)[1:]
cs = pd.Series(ci[:, 0], index=fidx).rename('95% Confidence Interval') #95% confidence interval lower limit
ce = pd.Series(ci[:, 1], index=fidx).rename('95% Confidence Interval') #95% confidence interval upper limit
yf = pd.Series(yf, index=fidx).rename('Forecast')
del fidx
if 'y2' in vars():
    #actual future values were supplied: show them next to the forecasts
    print(pd.concat((y2, yf), axis=1))
else:
    print(yf)
#SARIMA(X) plots: observed, fitted & forecast values, first over the last 200+ training
#periods (with confidence limits), then over the last few. In every joined line the last
#fitted point is prepended so the line connects to the fit; pd.concat replaces
#Series.append, which pandas 2.0 removed.
pl.rcParams["figure.figsize"] = 18, 8
#plot for last 200+ training periods
nb = min(nb, len(fit) - 1)
pl.plot(y[-nb:], c='r', label=yn)
pl.plot(fit[-nb:], label='Fit')
pl.plot(pd.concat([fit[-1:], yf]), c='cyan', label=yf.name)
pl.plot(pd.concat([fit[-1:], cs]), c='grey', alpha=0.2, label=cs.name)
pl.plot(pd.concat([fit[-1:], ce]), c='grey', alpha=0.2, label=ce.name)
# pl.legend(bbox_to_anchor=(1.005, 1.165))
pl.legend(bbox_to_anchor=(1.005, 1.235))
pl.xticks(rotation=90)
pl.xlabel(y.index.name)
pl.ylabel(yn)
pl.title('Last ' + str(nb) + ' observed & fitted values, & ' + str(n2) + " forecasts of '"
         + yn + "' by SARIMA" + ('X' if xc > 0 else '') + str(o.order)
         + str(o.seasonal_order) + (' with ' + str(xc) + ' X-variables' if xc > 0 else ''))
pl.show()
#plot for last few training periods
pl.plot(y[-lt:], c='r', label=yn)
if 'y2' in vars():
    #actual future values, drawn faintly as a continuation of y
    pl.plot(pd.concat([y[-1:], y2]), c='r', alpha=.2)
pl.plot(fit[-lt:], label='Fit')
pl.plot(pd.concat([fit[-1:], yf]), c='cyan', label=yf.name)
pl.legend(bbox_to_anchor=(1.005, 1.165))
pl.xticks(rotation=90)
pl.xlabel(y.index.name)
pl.ylabel(yn)
pl.title('Last ' + str(lt) + ' observed & fitted values, & ' + str(n2) + " forecasts of '" + yn
         + "' by SARIMA" + ('X' if xc > 0 else '') + str(o.order) + str(o.seasonal_order)
         + (' with ' + str(xc) + ' X-variables' if xc > 0 else ''))
pl.show()
#diagnostic plots (best effort: a failure here must not stop the analysis)
try:
    #error message could be "'Rectangle' object has no property 'normed'":
    o.plot_diagnostics(figsize=(18, 16))
    pl.xticks(rotation=90)
    pl.show()
except Exception as err:
    #pmdarima probably not installed by pip
    print('Diagnostic plots skipped:', err)
    #clean up the partly drawn figure
    fig = pl.gcf()
    fig.clear()
    pl.close(fig)
#plot ARIMAX & SARIMAX for last few training periods & forecasts
#(pd.concat replaces Series.append, which pandas 2.0 removed; the last observed/fitted
#point is prepended so each continuation joins its line)
pl.plot(y[-lt:], c='r', lw=6, label=yn)
if 'y2' in vars():
    pl.plot(pd.concat([y[-1:], y2]), c='r', lw=6, alpha=.075)
    del y2 #so that a later run without y2 does not pick up these values
pl.plot(ft[-lt:], c='b', lw=3, label='ARIMAX Fit')
pl.plot(pd.concat([ft[-1:], fc]), c='b', lw=3, label='ARIMAX '+fc.name)
pl.plot(fit[-lt:], c='lime', lw=3, label='SARIMAX Fit')
pl.plot(pd.concat([fit[-1:], yf]), c='lime', lw=3, label='SARIMAX '+yf.name)
pl.legend(bbox_to_anchor=(1.005, 1.235))
pl.xticks(rotation=90)
pl.xlabel(y.index.name)
pl.ylabel(yn)
pl.title('Observed & fitted values, & ' + str(n2) + " forecasts of '" + yn
         + "' by ARIMA" + ('X(' if xc > 0 else '(') + str(p) + ',' + str(d) + ','
         + str(q) + ') & SARIMA' + ('X' if xc > 0 else '') + str(o.order)
         + str(o.seasonal_order) + (' with ' + str(xc) + ' X-variables' if xc > 0 else ''))
pl.show()
#elapsed time since stm was taken at the start of the analysis
print('Analysis for time series "' + yn + '" took',
      '%.2f' % ((time.time() - stm) / 60), 'mins.')
Dataset has 439 rows and the following columns:
['date', 'Unnamed: 0', 'cumDailyNsoDeathsByDeathDate', 'cumAdmissions', 'FTSE_High_Price', 'US_Covid_Cases', 'cumCasesByPublishDate']
date Unnamed: 0 ... US_Covid_Cases cumCasesByPublishDate
date ...
2020-01-21 2020-01-21 0 ... 1.0 0.0
2020-01-22 2020-01-22 1 ... 1.0 0.0
2020-01-23 2020-01-23 2 ... 1.0 0.0
[3 rows x 7 columns]
date Unnamed: 0 ... US_Covid_Cases cumCasesByPublishDate
date ...
2021-04-01 2021-04-01 436 ... 30562856.0 4350266.0
2021-04-02 2021-04-02 437 ... 30631700.0 4353668.0
2021-04-03 2021-04-03 438 ... 30631700.0 4357091.0
[3 rows x 7 columns]
Fitting ARIMAX(1,1,1) for the last 425 obs of 'cumCasesByPublishDate' using 5 X-variables: ['Unnamed: 0', 'cumDailyNsoDeathsByDeathDate', 'cumAdmissions', 'FTSE_High_Price', 'US_Covid_Cases']...
ARIMA Model Results
===================================================================================
Dep. Variable: D.cumCasesByPublishDate No. Observations: 424
Model: ARIMA(1, 1, 1) Log Likelihood -4010.121
Method: css-mle S.D. of innovations 3090.117
Date: Thu, 22 Apr 2021 AIC 8038.242
Time: 13:48:21 BIC 8074.690
Sample: 01-22-2020 HQIC 8052.643
- 03-20-2021
=================================================================================================
coef std err z P>|z| [0.025 0.975]
-------------------------------------------------------------------------------------------------
const -3.764e+04 1.74e+04 -2.167 0.030 -7.17e+04 -3599.224
Unnamed: 0 220.9154 78.729 2.806 0.005 66.610 375.221
cumDailyNsoDeathsByDeathDate -1.3842 0.566 -2.445 0.014 -2.494 -0.275
cumAdmissions 0.4003 0.264 1.519 0.129 -0.116 0.917
FTSE_High_Price 4.2398 1.868 2.270 0.023 0.579 7.900
US_Covid_Cases -0.0014 0.002 -0.734 0.463 -0.005 0.002
ar.L1.D.cumCasesByPublishDate 0.9814 0.016 62.806 0.000 0.951 1.012
ma.L1.D.cumCasesByPublishDate -0.3981 0.053 -7.473 0.000 -0.503 -0.294
Roots
=============================================================================
Real Imaginary Modulus Frequency
-----------------------------------------------------------------------------
AR.1 1.0189 +0.0000j 1.0189 0.0000
MA.1 2.5119 +0.0000j 2.5119 0.0000
-----------------------------------------------------------------------------
cumCasesByPublishDate Fitted
2020-01-21 0.0 NaN
2020-01-22 0.0 -5.655161e+03
2020-01-23 0.0 -7.203103e+01
2020-01-24 0.0 1.270581e+02
2020-01-25 0.0 2.064265e+02
... ... ...
2021-03-16 4268821.0 4.269359e+06
2021-03-17 4274579.0 4.274681e+06
2021-03-18 4280882.0 4.280726e+06
2021-03-19 4285684.0 4.287477e+06
2021-03-20 4291271.0 4.291712e+06
[425 rows x 2 columns]
StdErr(Fitted) = 3090.1165059668942 (same as for 1st forecast value below)
ARIMA forecast for 14 future values of 'cumCasesByPublishDate' using 5 X-variables...
Forecast Std Error
2021-03-21 4.297564e+06 3090.116506
2021-03-22 4.304153e+06 5786.837930
2021-03-23 4.311204e+06 8824.287614
2021-03-24 4.318688e+06 12181.870015
2021-03-25 4.326658e+06 15825.771087
2021-03-26 4.335107e+06 19725.663371
2021-03-27 4.343987e+06 23855.801165
2021-03-28 4.353320e+06 28194.327247
2021-03-29 4.363027e+06 32722.490906
2021-03-30 4.373189e+06 37424.015096
2021-03-31 4.383580e+06 42284.613801
2021-04-01 4.394216e+06 47291.626298
2021-04-02 4.405146e+06 52433.737498
2021-04-03 4.416462e+06 57700.761195
Fitting Auto SARIMA(X) for the last 425 obs of 'cumCasesByPublishDate' using 5 X-variables: ['Unnamed: 0', 'cumDailyNsoDeathsByDeathDate', 'cumAdmissions', 'FTSE_High_Price', 'US_Covid_Cases']... (this may take time)
SARIMAX Results
==============================================================================
Dep. Variable: y No. Observations: 425
Model: SARIMAX(3, 0, 0) Log Likelihood -4055.586
Date: Thu, 22 Apr 2021 AIC 8131.173
Time: 13:49:35 BIC 8171.694
Sample: 01-21-2020 HQIC 8147.181
- 03-20-2021
Covariance Type: opg
================================================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------------------------
intercept 9.8110 466.978 0.021 0.983 -905.450 925.072
Unnamed: 0 -3274.3888 83.601 -39.167 0.000 -3438.245 -3110.533
cumDailyNsoDeathsByDeathDate 13.6807 6.856 1.995 0.046 0.243 27.118
cumAdmissions 1.9427 1.435 1.354 0.176 -0.870 4.755
FTSE_High_Price -0.1590 3.679 -0.043 0.966 -7.369 7.051
US_Covid_Cases 0.0510 0.012 4.248 0.000 0.027 0.074
ar.L1 1.6327 0.025 64.813 0.000 1.583 1.682
ar.L2 -0.3150 0.049 -6.385 0.000 -0.412 -0.218
ar.L3 -0.3183 0.031 -10.279 0.000 -0.379 -0.258
sigma2 1.609e+07 0.052 3.1e+08 0.000 1.61e+07 1.61e+07
===================================================================================
Ljung-Box (L1) (Q): 0.91 Jarque-Bera (JB): 13097.15
Prob(Q): 0.34 Prob(JB): 0.00
Heteroskedasticity (H): 15.25 Skew: -1.08
Prob(H) (two-sided): 0.00 Kurtosis: 30.11
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 1.02e+23. Standard errors may be unstable.
This model is not reliable!
cumCasesByPublishDate Fitted
2020-01-21 0.0 1.565201e+04
2020-01-22 0.0 -3.118081e+03
2020-01-23 0.0 -2.170499e+02
2020-01-24 0.0 -1.457686e+02
2020-01-25 0.0 -1.476194e+02
... ... ...
2021-03-16 4268821.0 4.267454e+06
2021-03-17 4274579.0 4.273232e+06
2021-03-18 4280882.0 4.279161e+06
2021-03-19 4285684.0 4.285562e+06
2021-03-20 4291271.0 4.288850e+06
[425 rows x 2 columns]
SARIMA forecast for 14 future values of 'cumCasesByPublishDate' using 5 X-variables...
2021-03-21 4.294246e+06
2021-03-22 4.297884e+06
2021-03-23 4.300882e+06
2021-03-24 4.304423e+06
2021-03-25 4.306797e+06
2021-03-26 4.308775e+06
2021-03-27 4.309409e+06
2021-03-28 4.308707e+06
2021-03-29 4.308266e+06
2021-03-30 4.306871e+06
2021-03-31 4.305313e+06
2021-04-01 4.303762e+06
2021-04-02 4.301311e+06
2021-04-03 4.294918e+06
Freq: D, Name: Forecast, dtype: float64
Analysis for time series "cumCasesByPublishDate" took 1.29 mins.
#load the cleaned dataset and keep only the date, the regressors chosen by backward
#elimination and the target 'US_Covid_Cases' (target placed last)
df = pd.read_csv('cleaned_data.csv')
#discard index artefacts written by an earlier to_csv() ('Unnamed: 0', ...); otherwise the
#row counter is offered to the variable selection as if it were a regressor
df = df.loc[:, ~df.columns.str.startswith('Unnamed')]
df["date"] = pd.to_datetime(df["date"])
initvar = [i for i in list(df.columns) if i != "date"]
cdf = df[initvar]
a,b = variableSelection(cdf,"US_Covid_Cases") #a: reduced model, b: full model
a.summary2()
#names of the surviving regressors, without the constant
var_list = ["date"] + [name for name in a.pvalues.index if name != "const"]
var_list.append("US_Covid_Cases")
df = df[var_list]
## Time Series ARIMA, ARIMAX or auto SARIMAX Forecast
#only have to specify filename, yname & timestamp-name; all other columns will be taken as Xs
#Xs should first go through regression variable selection
#categorical variables will be deleted
#missing values are automatically handled
#processed input data may be written to disk, just before analysis, using the parameter writedata
#warning messages only appear on the first run
#SARIMAX Seasonal AutoRegressive Integrated Moving Average with eXogenous regressors
#https://www.statsmodels.org/dev/examples/notebooks/generated/statespace_sarimax_stata.html
import os
import time

stm = time.time()  #start of the analysis; the elapsed time is printed at the very end

# Move into the data folder when one of the two known locations exists.
# Only the failures these calls can actually produce are tolerated: a missing
# environment variable (KeyError) or a folder that cannot be entered (OSError).
try:
    #Windows; raw strings keep the backslashes literal instead of relying on
    #the invalid escape sequence '\D'
    os.chdir(os.environ['USERPROFILE'] + r'\Documents' + r"\0_Teach\data")  #change "\0_Teach\data" accordingly
except (KeyError, OSError):
    try:
        os.chdir('/Users/' + os.environ['USER'] + '/Documents' + r"/0_Teach/data")  #for Mac
    except (KeyError, OSError):
        #assume data file in current folder
        pass
import pandas as pd

# This cell works on the dataframe `df` that already exists in the session.
# To analyse another file instead, load it here, for example:
# df = pd.read_excel(r"COE_TimeSeries_lag5.xlsx")  #ARIMAX; older python does not read .xlsx
# df = pd.read_excel(r"COE_TimeSeries_lag5.xlsx").iloc[:, :2]  #ARIMA, only 2 columns needed
# df = pd.read_csv('SARIMAX_grid-search_prepared_data.csv')
#in the target (y) column, leave tail periods to be forecasted blank

tn = 'date'  #time serial column name; becomes the index (row labels), which must not have any NaN
yn = "US_Covid_Cases"  #time series (target) column
pc = 7  #periodicity; 1 = no seasonality; 7 for daily, 4 for quarterly, 12 for monthly
#optional testing set: n2 actual future values kept for comparing with forecasts
# y2 = [1643, 1875, 1938, 1986, 1944, 1982, 1592]

# infer_datetime_format is no longer passed: it is deprecated since pandas 2.0,
# where format inference is the default behaviour.
df.index = pd.to_datetime(df[tn])
#alternative that also tries to attach a frequency to the index:
#df.index = pd.DatetimeIndex(df[tn], freq=pd.infer_freq(df[tn]))

print('Dataset has', len(df), 'rows and the following columns:')
print(list(df), '\n')
import numpy as np

# #artificially set some NaNs at the start and in the middle for y:
# df.iloc[:4, list(df).index(yn)] = np.nan
# df.iloc[10:14, list(df).index(yn)] = np.nan
# print(df.iloc[:15, list(df).index(yn)])

y = pd.DataFrame(df[yn])
# 1-based position of the first non-missing target value (1 when every value
# is missing, because argmax of an all-False mask is 0)
s = int(y[yn].notna().to_numpy().argmax()) + 1
print(df.head(max(s, 3)))  #show all of any missing at the start
y.interpolate(limit_area='inside', inplace=True)  #fill in missing in the middle
# rows to show at the tail: the target values still missing after the
# interpolation, less the leading offset, but never fewer than 3
e = max(int(len(y) - y[yn].count()) - s + 2, 3)
print(df.tail(e))  #shows all of any missing at the end

#if not time series, to remove any row with NaN, inf or -inf: df[~df.isin([np.nan, np.inf, -np.inf]).any(1)]
#delete any row with missing y at the start:
df = df.select_dtypes(include='number').iloc[s - 1:, :]  #only use numeric columns
df.interpolate(limit_area='inside', inplace=True)  #interpolate, but not extrapolate for either end
#target first, then the X columns with missing values filled in at the beginning & the end:
df = pd.concat([y.iloc[s - 1:, :],
                df.drop(columns=yn).interpolate(limit_direction='both')], axis=1)
#also df.bfill() & df.ffill()

writedata = False  #write processed data to disk
if writedata:
    kname = 'auto_SARIMAX_prepared_data.csv'
    if os.path.exists(kname):
        os.remove(kname)
    # df now holds the prepared target and X columns; the previous code wrote
    # pd.concat([y, x], 1), but x is not defined yet at this point
    df.to_csv(kname)
    print('\n' + kname, 'written to disk.')
    del kname
del os, writedata

if 'y2' in vars():
    #using df.index was inspired by Ong Jun Hong:
    # label the held-back actual values with the dates that follow the last
    # non-missing target value
    y2 = pd.Series(y2, index=pd.date_range(df.index[-(len(df[yn]) - df[yn].count() + 1)],
                                           periods=len(y2)+1,
                                           freq=df.index.freq)[1:]).rename(yn + ' (unused)')
#time series is prepared; now split it into a fitting part and a forecast part
n0 = int(df[yn].count())  #number of non-NaN target values

#periods to forecast ahead: fixed at 14 here (this replaces the former
#len(df) - n0, i.e. the count of missing target values)
n2 = 14
if n2 == 0:
    n2 = pc

n = min(3653, n0)  #use only the last 3653 observations
if n != n0:
    print('Only the last', n, 'of', n0, 'observations will be used for fitting.\n')

n1 = min(len(df), n + n2)  #rows taken from the end of df: fitting rows plus forecast rows
y = df[yn].iloc[-n1:-n2]  #target for fitting; excludes the last n2 rows
x = df.iloc[-n1:-n2, 1:]  #X columns for fitting in-sample values
x2 = df.iloc[-n2:, 1:]  #X columns for forecasting future values
xc = len(x.columns)  #number of X variables
#fit ARIMA to time series
# display/HTML come from IPython.display; importing them from
# IPython.core.display is deprecated
from IPython.display import display, HTML
display(HTML("<style>div.output_scroll { height: 31em; }</style>"))

p, d, q = 1, 1, 1  #(p,d,q)=(1,1,1) is only as an example here
print('\033[1m' + '\nFitting ARIMA' + ('X' if xc > 0 else '') + '(' + str(p) + ',' + str(d) + ',' + str(q)
      + ') for the last', len(y), "obs of '" + yn + ("' using " + str(xc) + ' X-variables: '
                                                     + str(list(x)) if xc > 0 else "'")
      + '...\n' + '\033[0m')  #in bold

# NOTE(review): statsmodels.tsa.arima_model was removed in statsmodels 0.13.
# The replacement (statsmodels.tsa.arima.model.ARIMA) has a different
# fit/predict/forecast interface, so the cells below need porting with it.
from statsmodels.tsa.arima_model import ARIMA
# from statsmodels.tsa.arima.model import ARIMA
#https://www.statsmodels.org/stable/generated/statsmodels.tsa.arima.model.ARIMA.html
#https://statsmodels.org/stable/generated/statsmodels.tsa.arima_model.ARIMA.fit.html
#out = ARIMA(y, order=(1,1,1), missing='raise').fit(trend='c', transparams=True)
out = ARIMA(y, (p,d,q), x).fit() if xc > 0 else ARIMA(y, (p,d,q)).fit()
#print(dir(out))
print(out.summary(), '\n')

#check reliability of model:
tt = out.tvalues  #z-values
# np.isfinite also rejects NaN, which the former min()/max() comparison could
# miss depending on where the NaN sat
if np.isfinite(tt).all():
    tt = out.summary().as_text()
    # the 4 characters that start 7 characters before 'Kurtosis'; meant to be
    # the Prob(H) (two-sided) value
    # NOTE(review): if the summary text has no 'Kurtosis' entry, find() returns
    # -1 and this slice reads from the end of the text instead
    tt = tt[tt.find('Kurtosis') - 7:][:4]
    iss = tt in (' nan', '1.00', '0.00')
else:
    iss = True
if iss:
    print('This model is not reliable!\n')
del tt, iss
#https://statsmodels.org/stable/generated/statsmodels.tsa.arima_model.ARIMAResults.predict.html
#ft = out.fittedvalues #fitted values for d-integrated(differenced) series, not original series
ft = out.predict(typ='levels') #without 'levels', fitted output will be for d-differenced series
yh = pd.concat((y, ft.rename('Fitted')), 1) #y & yhat
print(yh)
#plot time series vs fit and its 'line'
#plot 'regression' line
mm = np.array([min(ft), max(ft)])
y0 = y[len(y)-len(ft):] #y & ft can have different lengths
import matplotlib.pyplot as pl
%matplotlib inline
pl.rcParams['figure.figsize'] = 7, 7 #square plot
# pl.rcParams['lines.linewidth'] = 1.0
# pl.plot(mm, sum(np.polyfit(ft, y0, 1) * [mm, 1]), 'red') #increase linewidth for darker line
pl.plot(mm, sum(np.polyfit(ft, y0, 1) * [mm, 1]), 'r', lw=1) #increase linewidth for darker line
#scatter time series vs fit
# pl.rcParams['lines.markersize'] = 2.0
# pl.scatter(ft, y0)
pl.scatter(ft, y0, s=3)
pl.ylabel(yn)
pl.xlabel('Fitted ' + yn)
pl.title('R² = ' + str('%.4f' % yh.corr().iloc[1, 0]**2) + ' for ' + str(len(y0)) +
' obs by ARIMA' + ('X(' if xc > 0 else '(') + str(p) + ',' + str(d) + ','
+ str(q) + ')' + (' with ' + str(xc) + ' X-variables' if xc > 0 else ''))
pl.show()
#print('forecast error:', out._forecast_error) #doesn't work
# out.sigma2 is a bit different from sum((y-ft).dropna()**2)/len(ft)
print('StdErr(Fitted) =', np.sqrt(out.sigma2), '(same as for 1st forecast value below)' '\n\n')
# print('StdErr(Fitted) =', np.sqrt(out.mse), '(same as for 1st forecast value below)' '\n\n')
#forecast future values
# dates from the last fitted observation through the end of the forecast
# horizon; stnd[1:] are the n2 forecast dates
stnd = pd.date_range(y.index[-1], periods=n2+1, freq=y.index.freq)
print('\033[1m' + 'ARIMA forecast for', n2, "future values of '" + yn
      + ("' using " + str(xc) + ' X-variables' if xc > 0 else "'") + '...\n' + '\033[0m')
#https://statsmodels.org/stable/generated/statsmodels.tsa.arima_model.ARIMAResults.forecast.html
fc, stderr, confint = out.forecast(n2, x2) if xc > 0 else out.forecast(n2)
print(pd.concat((pd.Series(fc), pd.Series(stderr)), axis=1, keys=['Forecast', 'Std Error']
                ).set_index(stnd[1:]))
fc = pd.Series(fc, index=pd.date_range(ft.index[-1], periods=len(fc)+1,
                                       freq=ft.index.freq)[1:]).rename('Forecast')

#plot last 200+ training periods' observed & fitted, & forecast
nb = -(-200 // pc * pc) if pc > 1 else 200  #last training periods to plot: 200 rounded up to whole periods
pl.rcParams['figure.figsize'] = 18, 8  #use the whole width of window
#https://matplotlib.org/api/_as_gen/matplotlib.pyplot.plot.html
pl.title('Last ' + str(nb) + ' observed & fitted values, & ' + str(n2) + " forecasts of '"
         + yn + "' by ARIMA" + ('X(' if xc > 0 else '(') + str(p) + ',' + str(d) + ','
         + str(q) + ')' + (' with ' + str(xc) + ' X-variables' if xc > 0 else ''))
pl.plot(ft[-nb:], label='Fit')
# the forecast line starts at the last fitted point so the two curves join;
# pd.concat replaces Series.append, which pandas 2.0 removed
pl.plot(pd.concat([ft[-1:], fc]), c='cyan', label=fc.name)
pl.plot(y[-nb:], label=yn)
#https://matplotlib.org/api/_as_gen/matplotlib.pyplot.legend.html
#legend is anchored just above the top-right corner of the 18x8 plot
pl.legend(bbox_to_anchor=(1.005, 1.165))
#pl.legend(loc='best') #avoid plotted space within the frame, but cpu-intensive
pl.xticks(rotation=90)
pl.xlabel(y.index.name)
pl.ylabel(yn)
pl.show()

#window for the plots of the last few training periods:
#last 10% of the series, capped at max(2 * pc, 24) periods, and at least 2
lt = max(2, min(max(2 * pc, 24), (len(y) - 1) // 10))
start = y.index[-lt]
end = stnd[-1]  #last forecast date
#start & end suit out.plot_predict(start, end, x2), which is not called here:
#https://statsmodels.org/stable/generated/statsmodels.tsa.arima_model.ARMAResults.plot_predict.html
#Fit SARIMAX Seasonal AutoRegressive Integrated Moving Average with X variables
import warnings
#suppress warning messages (those with peach background)
warnings.simplefilter('ignore')
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', 'statsmodels.tsa.arima_model.ARMA',
                        FutureWarning)
warnings.filterwarnings('ignore', 'statsmodels.tsa.arima_model.ARIMA',
                        FutureWarning)

#do below only once, to install pmdarima
#alkaline-ml no longer available:
#!conda install -c alkaline-ml pmdarima --y
#for python before 3.8:
#!conda install -c Saravji pmdarima --y
#as last resort:
#pip install pmdarima
#https://www.statsmodels.org/stable/generated/statsmodels.tsa.statespace.sarimax.SARIMAX.html
#https://www.statsmodels.org/devel/examples/notebooks/generated/statespace_sarimax_pymc3.html
#https://stackoverflow.com/questions/22770352/auto-arima-equivalent-for-python
#https://iprokin.github.io/posts/2017-01-08-idARIMA.html
try:
    #https://alkaline-ml.com/pmdarima/modules/generated/pmdarima.arima.auto_arima.html
    from pmdarima.arima import auto_arima  #wrapper around statsmodels arima
except ImportError:
    # Fallback: register six and joblib under the sklearn.externals names,
    # then retry the import. Only import failures trigger it; any other error
    # is left to surface.
    import six
    import sys
    sys.modules['sklearn.externals.six'] = six
    import joblib
    sys.modules['sklearn.externals.joblib'] = joblib
    from pmdarima.arima import auto_arima  #wrapper around statsmodels arima

print('\033[1m' + '\nFitting Auto SARIMA(X) for the last', len(y), "obs of '" + yn +
      ("' using " + str(xc) + ' X-variables: ' + str(list(x)) if xc > 0 else "'")
      + '...' + '\033[0m', '(this may take time)\n')  #in bold
#o = auto_arima(y) #auto ARIMA; not SARIMA
# Search options shared by the run with and the run without X variables, kept
# in one place so that the two cannot drift apart.
_search = dict(start_p=0, start_q=0, max_p=pc, max_d=3, max_q=pc,  #for y
               start_P=0, start_Q=0, max_P=pc, max_D=3, max_Q=pc,  #for seasonal
               max_order=None,  #p+q+P+Q; None means no constraints on maximum order
               #1 = no seasonality; 7 for daily, 4 for quarterly, 12 for monthly:
               m=pc,  #'m != 1' makes this code run for longer
               suppress_warnings=True, error_action='ignore',
               n_jobs=-1)  #n_jobs=-1 tries to use all CPU threads
if xc > 0:
    #auto SARIMAX
    o = auto_arima(y, x, **_search)
    fit = o.predict_in_sample(x)
    if isinstance(fit, np.ndarray):
        #label a plain array with the training dates
        fit = pd.Series(fit, index=y.index)
else:
    #auto SARIMA
    o = auto_arima(y, **_search)
    fit = o.predict_in_sample()  #fit
    #fit is numpy array when no x
    fit = pd.Series(fit, index=pd.date_range(y.index[-len(fit)], periods=len(fit),
                                             freq=y.index.freq))
del _search

# print(dir(o), '\n')
# print(o.scoring, '\n')
print(o.summary(), '\n')

#http://medium.com/@josemarcialportilla/using-python-and-auto-arima-to-forecast-seasonal-time-series-90877adff03c
#check reliability of model:
try:
    tt = o.params()/o.bse()  #z-values
    # np.isfinite also rejects NaN, which the former min()/max() comparison
    # could miss depending on where the NaN sat
    if np.isfinite(tt).all():
        tt = o.summary().as_text()
        # the 4 characters that start 7 characters before 'Kurtosis'; meant to
        # be the Prob(H) (two-sided) value
        tt = tt[tt.find('Kurtosis') - 7:][:4]
        iss = tt in (' nan', '1.00', '0.00')
    else:
        iss = True
    del tt
except Exception:
    # any failure while inspecting the model counts as "not reliable"
    iss = True
if iss:
    print('This model is not reliable!\n')
del iss
yhs = pd.concat((y, fit.rename('Fitted')), axis=1)  #y & yhat #this works with updated pmdarima
# yhs = pd.concat((y, pd.Series(fit, index=y.index).rename('Fitted')), axis=1) #y & yhat
print(yhs)  #y-hat for SARIMA/X

#plot time series vs fit and its 'regression' line
#the first fitted value is left out of the line, the scatter and R² below
mm = np.array([min(fit.iloc[1:]), max(fit.iloc[1:])])  #x-range of the line
y0 = y[len(y)-len(fit):]  #y & fit can have different lengths
pl.rcParams['figure.figsize'] = 7, 7  #square plot
# straight line of y0 on fit, evaluated at both ends of the fitted range;
# np.polyval replaces sum(coefficients * [mm, 1]), which relies on building a
# ragged array and fails on current numpy
pl.plot(mm, np.polyval(np.polyfit(fit.iloc[1:], y0.iloc[1:], 1), mm), 'r', lw=1)  #increase linewidth for darker line
#scatter time series vs fit
pl.scatter(fit.iloc[1:], y0.iloc[1:], s=3)
pl.ylabel(yn)
pl.xlabel('Fitted ' + yn)
pl.title('R² = ' + str('%.4f' % yhs.iloc[1:, :].corr().iloc[1, 0]**2) + ' for ' + str(len(y0) - 1) +
         ' obs by SARIMA' + ('X' if xc > 0 else '') + str(o.order) + str(o.seasonal_order)
         + (' with ' + str(xc) + ' X-variables' if xc > 0 else ''))
pl.show()
print('\033[1m' + 'SARIMA forecast for', n2, "future values of '" + yn
      + ("' using " + str(xc) + ' X-variables' if xc > 0 else "'") + '...\n' + '\033[0m')
# yf = o.predict(n2, x2) if xc > 0 else o.predict(n2) #forecast
yf, ci = o.predict(n2, x2, return_conf_int=True) if xc > 0 else o.predict(n2, return_conf_int=True)  #forecast
cs = ci[:, 0]  #95% confidence interval lower limit
ce = ci[:, 1]  #95% confidence interval upper limit
# label the forecast with the dates that follow the last training date; the
# two interval limits reuse that index instead of rebuilding it
yf = pd.Series(yf, index=pd.date_range(y.index[-1], periods=len(yf)+1, freq=y.index.freq)[1:]).rename('Forecast')
cs = pd.Series(cs, index=yf.index).rename('95% Confidence Interval')
ce = pd.Series(ce, index=yf.index).rename('95% Confidence Interval')
if 'y2' in vars():
    print(pd.concat((y2, yf), axis=1))
else:
    print(yf)

# In all plots below the forecast-side curves start at the last fitted point
# so that they join the fit; pd.concat replaces Series.append, which
# pandas 2.0 removed.
pl.rcParams["figure.figsize"] = 18, 8
#plot for last 200+ training periods
nb = min(nb, len(fit) - 1)
pl.plot(y[-nb:], c='r', label=yn)
pl.plot(fit[-nb:], label='Fit')
pl.plot(pd.concat([fit[-1:], yf]), c='cyan', label=yf.name)
pl.plot(pd.concat([fit[-1:], cs]), c='grey', alpha=0.2, label=cs.name)
pl.plot(pd.concat([fit[-1:], ce]), c='grey', alpha=0.2, label=ce.name)
# pl.legend(bbox_to_anchor=(1.005, 1.165))
pl.legend(bbox_to_anchor=(1.005, 1.235))
pl.xticks(rotation=90)
pl.xlabel(y.index.name)
pl.ylabel(yn)
pl.title('Last ' + str(nb) + ' observed & fitted values, & ' + str(n2) + " forecasts of '"
         + yn + "' by SARIMA" + ('X' if xc > 0 else '') + str(o.order)
         + str(o.seasonal_order) + (' with ' + str(xc) + ' X-variables' if xc > 0 else ''))
pl.show()

#plot for last few training periods
pl.plot(y[-lt:], c='r', label=yn)
if 'y2' in vars():
    pl.plot(pd.concat([y[-1:], y2]), c='r', alpha=.2)
pl.plot(fit[-lt:], label='Fit')
pl.plot(pd.concat([fit[-1:], yf]), c='cyan', label=yf.name)
pl.legend(bbox_to_anchor=(1.005, 1.165))
pl.xticks(rotation=90)
pl.xlabel(y.index.name)
pl.ylabel(yn)
pl.title('Last ' + str(lt) + ' observed & fitted values, & ' + str(n2) + " forecasts of '" + yn
         + "' by SARIMA" + ('X' if xc > 0 else '') + str(o.order) + str(o.seasonal_order)
         + (' with ' + str(xc) + ' X-variables' if xc > 0 else ''))
pl.show()

#diagnostic plots
try:
    #error message could be "'Rectangle' object has no property 'normed'":
    o.plot_diagnostics(figsize=(18, 16))
    pl.xticks(rotation=90)
    pl.show()
except Exception:
    #pmdarima probably not installed by pip
    #clean up the half-drawn figure and carry on without diagnostics
    fig = pl.gcf()
    fig.clear()
    pl.close(fig)

#plot ARIMAX & SARIMAX for last few training periods & forecasts
pl.plot(y[-lt:], c='r', lw=6, label=yn)
if 'y2' in vars():
    pl.plot(pd.concat([y[-1:], y2]), c='r', lw=6, alpha=.075)
    del y2
pl.plot(ft[-lt:], c='b', lw=3, label='ARIMAX Fit')
pl.plot(pd.concat([ft[-1:], fc]), c='b', lw=3, label='ARIMAX '+fc.name)
pl.plot(fit[-lt:], c='lime', lw=3, label='SARIMAX Fit')
pl.plot(pd.concat([fit[-1:], yf]), c='lime', lw=3, label='SARIMAX '+yf.name)
pl.legend(bbox_to_anchor=(1.005, 1.235))
pl.xticks(rotation=90)
pl.xlabel(y.index.name)
pl.ylabel(yn)
pl.title('Observed & fitted values, & ' + str(n2) + " forecasts of '" + yn
         + "' by ARIMA" + ('X(' if xc > 0 else '(') + str(p) + ',' + str(d) + ','
         + str(q) + ') & SARIMA' + ('X' if xc > 0 else '') + str(o.order)
         + str(o.seasonal_order) + (' with ' + str(xc) + ' X-variables' if xc > 0 else ''))
pl.show()

print('Analysis for time series "' + yn + '" took',
      '%.2f' % ((time.time() - stm) / 60), 'mins.')
Dataset has 439 rows and the following columns:
['date', 'cumCasesByPublishDate', 'cumVirusTests', 'cumDailyNsoDeathsByDeathDate', 'cumAdmissions', 'US_Covid_Deaths', 'US_Covid_Cases']
date cumCasesByPublishDate ... US_Covid_Deaths US_Covid_Cases
date ...
2020-01-21 2020-01-21 0.0 ... 0.0 1.0
2020-01-22 2020-01-22 0.0 ... 0.0 1.0
2020-01-23 2020-01-23 0.0 ... 0.0 1.0
[3 rows x 7 columns]
date cumCasesByPublishDate ... US_Covid_Deaths US_Covid_Cases
date ...
2021-04-01 2021-04-01 4350266.0 ... 552593.0 30562856.0
2021-04-02 2021-04-02 4353668.0 ... 553554.0 30631700.0
2021-04-03 2021-04-03 4357091.0 ... 553554.0 30631700.0
[3 rows x 7 columns]
Fitting ARIMAX(1,1,1) for the last 425 obs of 'US_Covid_Cases' using 5 X-variables: ['cumCasesByPublishDate', 'cumVirusTests', 'cumDailyNsoDeathsByDeathDate', 'cumAdmissions', 'US_Covid_Deaths']...
ARIMA Model Results
==============================================================================
Dep. Variable: D.US_Covid_Cases No. Observations: 424
Model: ARIMA(1, 1, 1) Log Likelihood -4708.848
Method: css-mle S.D. of innovations 16058.461
Date: Thu, 22 Apr 2021 AIC 9435.697
Time: 13:49:38 BIC 9472.144
Sample: 01-22-2020 HQIC 9450.097
- 03-20-2021
================================================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------------------------
const -1.967e+04 4.52e+04 -0.436 0.663 -1.08e+05 6.88e+04
cumCasesByPublishDate -0.1290 0.052 -2.458 0.014 -0.232 -0.026
cumVirusTests -0.0039 0.001 -2.718 0.007 -0.007 -0.001
cumDailyNsoDeathsByDeathDate -12.2481 2.719 -4.505 0.000 -17.577 -6.919
cumAdmissions 4.3999 1.245 3.534 0.000 1.960 6.840
US_Covid_Deaths 1.6866 0.590 2.860 0.004 0.531 2.842
ar.L1.D.US_Covid_Cases 0.9878 0.010 102.671 0.000 0.969 1.007
ma.L1.D.US_Covid_Cases -0.5855 0.039 -14.972 0.000 -0.662 -0.509
Roots
=============================================================================
Real Imaginary Modulus Frequency
-----------------------------------------------------------------------------
AR.1 1.0123 +0.0000j 1.0123 0.0000
MA.1 1.7081 +0.0000j 1.7081 0.0000
-----------------------------------------------------------------------------
US_Covid_Cases Fitted
2020-01-21 1.0 NaN
2020-01-22 1.0 -1.934257e+04
2020-01-23 1.0 -1.387433e+03
2020-01-24 2.0 -5.343648e+02
2020-01-25 3.0 -1.933498e+02
... ... ...
2021-03-16 29573782.0 2.956461e+07
2021-03-17 29632723.0 2.961836e+07
2021-03-18 29693583.0 2.968016e+07
2021-03-19 29754181.0 2.974568e+07
2021-03-20 29808812.0 2.981013e+07
[425 rows x 2 columns]
StdErr(Fitted) = 16058.461379398015 (same as for 1st forecast value below)
ARIMA forecast for 14 future values of 'US_Covid_Cases' using 5 X-variables...
Forecast Std Error
2021-03-21 2.985858e+07 16058.461379
2021-03-22 2.990536e+07 27658.741557
2021-03-23 2.995118e+07 40004.069274
2021-03-24 2.999342e+07 53289.891554
2021-03-25 3.003306e+07 67502.446253
2021-03-26 3.007256e+07 82589.089302
2021-03-27 3.011228e+07 98491.468265
2021-03-28 3.014720e+07 115153.866658
2021-03-29 3.017888e+07 132525.226135
2021-03-30 3.020896e+07 150559.312561
2021-03-31 3.023545e+07 169214.332487
2021-04-01 3.026315e+07 188452.422241
2021-04-02 3.029219e+07 208239.149370
2021-04-03 3.032095e+07 228543.068650
Fitting Auto SARIMA(X) for the last 425 obs of 'US_Covid_Cases' using 5 X-variables: ['cumCasesByPublishDate', 'cumVirusTests', 'cumDailyNsoDeathsByDeathDate', 'cumAdmissions', 'US_Covid_Deaths']... (this may take time)
SARIMAX Results
==============================================================================================
Dep. Variable: y No. Observations: 425
Model: SARIMAX(2, 0, 1)x(2, 0, [1, 2], 7) Log Likelihood -4860.703
Date: Thu, 22 Apr 2021 AIC 9749.407
Time: 13:50:58 BIC 9806.136
Sample: 01-21-2020 HQIC 9771.818
- 03-20-2021
Covariance Type: opg
================================================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------------------------
intercept 407.9077 1282.038 0.318 0.750 -2104.841 2920.657
cumCasesByPublishDate 0.6038 0.675 0.894 0.371 -0.720 1.927
cumVirusTests 0.0308 0.028 1.099 0.272 -0.024 0.086
cumDailyNsoDeathsByDeathDate -196.7285 44.202 -4.451 0.000 -283.364 -110.093
cumAdmissions 48.1934 7.986 6.035 0.000 32.542 63.845
US_Covid_Deaths 12.6263 4.996 2.527 0.011 2.835 22.418
ar.L1 1.9878 0.023 87.410 0.000 1.943 2.032
ar.L2 -0.9883 0.023 -43.107 0.000 -1.033 -0.943
ma.L1 -0.5345 0.100 -5.333 0.000 -0.731 -0.338
ar.S.L7 0.6718 1.064 0.631 0.528 -1.414 2.758
ar.S.L14 0.2396 0.827 0.290 0.772 -1.382 1.861
ma.S.L7 -0.3633 1.093 -0.332 0.740 -2.506 1.779
ma.S.L14 -0.3718 0.578 -0.643 0.520 -1.505 0.762
sigma2 1.021e+09 0.001 9.35e+11 0.000 1.02e+09 1.02e+09
===================================================================================
Ljung-Box (L1) (Q): 4.76 Jarque-Bera (JB): 1231.98
Prob(Q): 0.03 Prob(JB): 0.00
Heteroskedasticity (H): 2.97 Skew: 0.56
Prob(H) (two-sided): 0.00 Kurtosis: 11.27
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 1.09e+29. Standard errors may be unstable.
This model is not reliable!
US_Covid_Cases Fitted
2020-01-21 1.0 9.710318e+06
2020-01-22 1.0 5.482842e+03
2020-01-23 1.0 3.109224e+03
2020-01-24 2.0 4.274057e+03
2020-01-25 3.0 5.154498e+03
... ... ...
2021-03-16 29573782.0 2.955802e+07
2021-03-17 29632723.0 2.962447e+07
2021-03-18 29693583.0 2.969329e+07
2021-03-19 29754181.0 2.973800e+07
2021-03-20 29808812.0 2.979461e+07
[425 rows x 2 columns]
SARIMA forecast for 14 future values of 'US_Covid_Cases' using 5 X-variables...
2021-03-21 2.988318e+07
2021-03-22 2.993875e+07
2021-03-23 2.997752e+07
2021-03-24 3.004819e+07
2021-03-25 3.010050e+07
2021-03-26 3.013435e+07
2021-03-27 3.015370e+07
2021-03-28 3.019156e+07
2021-03-29 3.020105e+07
2021-03-30 3.019103e+07
2021-03-31 3.019656e+07
2021-04-01 3.015919e+07
2021-04-02 3.012228e+07
2021-04-03 3.007486e+07
Freq: D, Name: Forecast, dtype: float64
Analysis for time series "US_Covid_Cases" took 1.38 mins.
#Time Series Exponential, Holt's, Winters' Forecasts
import os
import pandas as pd
from statsmodels.tsa.api import ExponentialSmoothing
from IPython.core.display import display, HTML
display(HTML("<style>div.output_scroll { height: 31em; }</style>"))
import matplotlib.pyplot as pl
%matplotlib inline
#https://matplotlib.org/tutorials/introductory/customizing.html
pl.rcParams['figure.figsize'] = 18, 8
pl.rcParams['lines.linewidth'] = 1
pl.rcParams['lines.markersize'] = 1
import warnings
warnings.simplefilter('ignore')
# if os.name == 'nt':
# #Windows:
# os.chdir(os.environ['USERPROFILE'] + '\Documents' + r"\0_Teach\data")
# else:
# #Mac:
# os.chdir('/Users/' + os.environ['USER'] + '/Documents' + r"/0_Teach/data")
# d = pd.read_csv(r"airline-passengers.csv")
df = pd.read_csv("cleaned_data.csv")
d = df[["date", "cumCasesByPublishDate"]]
print('Original dataset:\n\n', d)
d.index = pd.DatetimeIndex(d.iloc[:, 0], freq='infer')
p = 14 #periodicity or period length
d = d.iloc[:, 1:] #remove row labels, i.e. index
y = d.iloc[:, 0] #keep only 1st column; here same as d
print('\nInferred time series:\n')
print(y)
def snf(m, o, p, y):
    """Print the summary of a fitted smoothing model and plot it against the data.

    m: method name, used in the printed headings and in the plot title
    o: output from fit(); its summary(), fittedvalues and forecast() are used
    p: number of periods to forecast beyond the fitted values
    y: the observed time series, plotted in red
    """
    bold_on, bold_off = '\033[1m', '\033[0m'
    print(2 * '\n', bold_on)
    print(m, 'Forecast:', bold_off, '\n')
    print(o.summary())

    # fitted values followed by the p forecast values, as a single series
    # that carries the name and index name of the observed series
    ahead = o.forecast(p)
    f = pd.concat([o.fittedvalues, ahead], axis=0)
    f.index.name = y.index.name
    f.name = y.name

    print('\n' + m, 'last few fitted & forecast values:\n')
    print(f.tail(p + 3))  #the last 3 fitted values plus all p forecasts

    heading = y.name + ' (in red) & ' + m + ' fitted values (in blue)'
    pl.title(heading)
    pl.plot(y, color='red')
    pl.plot(f)
    pl.show()
#o = ExponentialSmoothing(y.astype('double'), freq=y.index.freq, seasonal_periods=12, missing='raise')
# Fit three models in turn, each with more components than the last, and
# report each one with snf(). After the loop, o is the Winters' fit.
for _name, _extra in (
        ('Simple Exponential', {}),
        ("Holt's", {'trend': 'add'}),
        # Use additive model as we have rows with 0 values!
        ("Winters'", {'trend': 'add', 'seasonal': 'add'}),
):
    o = ExponentialSmoothing(y, freq=y.index.freqstr, seasonal_periods=p, **_extra).fit()
    snf(_name, o, p, y)
del _name, _extra
Original dataset:
date cumCasesByPublishDate
0 2020-01-21 0.0
1 2020-01-22 0.0
2 2020-01-23 0.0
3 2020-01-24 0.0
4 2020-01-25 0.0
.. ... ...
434 2021-03-30 4341736.0
435 2021-03-31 4345788.0
436 2021-04-01 4350266.0
437 2021-04-02 4353668.0
438 2021-04-03 4357091.0
[439 rows x 2 columns]
Inferred time series:
date
2020-01-21 0.0
2020-01-22 0.0
2020-01-23 0.0
2020-01-24 0.0
2020-01-25 0.0
...
2021-03-30 4341736.0
2021-03-31 4345788.0
2021-04-01 4350266.0
2021-04-02 4353668.0
2021-04-03 4357091.0
Freq: D, Name: cumCasesByPublishDate, Length: 439, dtype: float64
Simple Exponential Forecast:
ExponentialSmoothing Model Results
=================================================================================
Dep. Variable: cumCasesByPublishDate No. Observations: 439
Model: ExponentialSmoothing SSE 125170618534.056
Optimized: True AIC 8550.647
Trend: None BIC 8558.816
Seasonal: None AICC 8550.739
Seasonal Periods: None Date: Thu, 22 Apr 2021
Box-Cox: False Time: 17:01:37
Box-Cox Coeff.: None
==============================================================================
coeff code optimized
------------------------------------------------------------------------------
smoothing_level 0.9950000 alpha True
initial_level 0.000000 l.0 True
------------------------------------------------------------------------------
Simple Exponential last few fitted & forecast values:
date
2021-04-01 4.345768e+06
2021-04-02 4.350244e+06
2021-04-03 4.353651e+06
2021-04-04 4.357074e+06
2021-04-05 4.357074e+06
2021-04-06 4.357074e+06
2021-04-07 4.357074e+06
2021-04-08 4.357074e+06
2021-04-09 4.357074e+06
2021-04-10 4.357074e+06
2021-04-11 4.357074e+06
2021-04-12 4.357074e+06
2021-04-13 4.357074e+06
2021-04-14 4.357074e+06
2021-04-15 4.357074e+06
2021-04-16 4.357074e+06
2021-04-17 4.357074e+06
Freq: D, Name: cumCasesByPublishDate, dtype: float64
Holt's Forecast:
ExponentialSmoothing Model Results
=================================================================================
Dep. Variable: cumCasesByPublishDate No. Observations: 439
Model: ExponentialSmoothing SSE 4132883485.652
Optimized: True AIC 7057.349
Trend: Additive BIC 7073.687
Seasonal: None AICC 7057.543
Seasonal Periods: None Date: Thu, 22 Apr 2021
Box-Cox: False Time: 17:01:37
Box-Cox Coeff.: None
==============================================================================
coeff code optimized
------------------------------------------------------------------------------
smoothing_level 0.9950000 alpha True
smoothing_trend 0.6159524 beta True
initial_level 0.000000 l.0 True
initial_trend 0.000000 b.0 True
------------------------------------------------------------------------------
Holt's last few fitted & forecast values:
date
2021-04-01 4.349900e+06
2021-04-02 4.354600e+06
2021-04-03 4.357437e+06
2021-04-04 4.360645e+06
2021-04-05 4.364197e+06
2021-04-06 4.367750e+06
2021-04-07 4.371302e+06
2021-04-08 4.374855e+06
2021-04-09 4.378407e+06
2021-04-10 4.381959e+06
2021-04-11 4.385512e+06
2021-04-12 4.389064e+06
2021-04-13 4.392616e+06
2021-04-14 4.396169e+06
2021-04-15 4.399721e+06
2021-04-16 4.403273e+06
2021-04-17 4.406826e+06
Freq: D, Name: cumCasesByPublishDate, dtype: float64
Winters' Forecast:
ExponentialSmoothing Model Results
=================================================================================
Dep. Variable: cumCasesByPublishDate No. Observations: 439
Model: ExponentialSmoothing SSE 4133855926.084
Optimized: True AIC 7085.452
Trend: Additive BIC 7158.973
Seasonal: Additive AICC 7087.461
Seasonal Periods: 14 Date: Thu, 22 Apr 2021
Box-Cox: False Time: 17:01:38
Box-Cox Coeff.: None
=================================================================================
coeff code optimized
---------------------------------------------------------------------------------
smoothing_level 0.9950000 alpha True
smoothing_trend 0.6041071 beta True
smoothing_seasonal 0.0001 gamma True
initial_level 1.2148e+06 l.0 True
initial_trend 0.3979592 b.0 True
initial_seasons.0 -1.2148e+06 s.0 True
initial_seasons.1 -1.2148e+06 s.1 True
initial_seasons.2 -1.2148e+06 s.2 True
initial_seasons.3 -1.2148e+06 s.3 True
initial_seasons.4 -1.2148e+06 s.4 True
initial_seasons.5 -1.2148e+06 s.5 True
initial_seasons.6 -1.2148e+06 s.6 True
initial_seasons.7 -1.2148e+06 s.7 True
initial_seasons.8 -1.2148e+06 s.8 True
initial_seasons.9 -1.2148e+06 s.9 True
initial_seasons.10 -1.2148e+06 s.10 True
initial_seasons.11 -1.2148e+06 s.11 True
initial_seasons.12 -1.2148e+06 s.12 True
initial_seasons.13 -1.2148e+06 s.13 True
---------------------------------------------------------------------------------
Winters' last few fitted & forecast values:
date
2021-04-01 4.349901e+06
2021-04-02 4.354596e+06
2021-04-03 4.357446e+06
2021-04-04 4.360654e+06
2021-04-05 4.364214e+06
2021-04-06 4.367778e+06
2021-04-07 4.371343e+06
2021-04-08 4.374902e+06
2021-04-09 4.378468e+06
2021-04-10 4.382026e+06
2021-04-11 4.385587e+06
2021-04-12 4.389149e+06
2021-04-13 4.392710e+06
2021-04-14 4.396275e+06
2021-04-15 4.399835e+06
2021-04-16 4.403396e+06
2021-04-17 4.406956e+06
Freq: D, Name: cumCasesByPublishDate, dtype: float64
#Time Series Exponential, Holt's, Winters' Forecasts
import os
import pandas as pd
from statsmodels.tsa.api import ExponentialSmoothing
from IPython.core.display import display, HTML
display(HTML("<style>div.output_scroll { height: 31em; }</style>"))
import matplotlib.pyplot as pl
%matplotlib inline
#https://matplotlib.org/tutorials/introductory/customizing.html
pl.rcParams['figure.figsize'] = 18, 8
pl.rcParams['lines.linewidth'] = 1
pl.rcParams['lines.markersize'] = 1
import warnings
warnings.simplefilter('ignore')
# if os.name == 'nt':
# #Windows:
# os.chdir(os.environ['USERPROFILE'] + '\Documents' + r"\0_Teach\data")
# else:
# #Mac:
# os.chdir('/Users/' + os.environ['USER'] + '/Documents' + r"/0_Teach/data")
# d = pd.read_csv(r"airline-passengers.csv")
# Load the cleaned dataset and turn the US cumulative case counts into a
# date-indexed series `y` for the smoothing models below.
df = pd.read_csv("cleaned_data.csv")
d = df[["date", "US_Covid_Cases"]]
print('Original dataset:\n\n', d)
p = 14  # periodicity or period length
# index the rows by date; freq='infer' lets pandas work out the spacing
d.index = pd.DatetimeIndex(d["date"], freq='infer')
d = d[["US_Covid_Cases"]]   # drop the now-redundant date column
y = d["US_Covid_Cases"]     # the single remaining column, as a Series
print('\nInferred time series:\n')
print(y)
def snf(m, o, p, y):
    """Report one fitted smoothing model: summary, tail of values, and a plot.

    m -- method name, used in the printed headings and the plot title
    o -- fitted results object; summary(), fittedvalues and forecast() are used
    p -- periodicity; also the number of steps forecast past the data
    y -- the observed time series
    """
    bold_on = '\033[1m'
    bold_off = '\033[0m'
    print(2 * '\n', bold_on)
    print(m, 'Forecast:', bold_off, '\n')
    print(o.summary())

    # in-sample fitted values followed by p forecasts, stacked into one series
    # that carries the same index name and series name as y
    pieces = [o.fittedvalues, o.forecast(p)]
    combined = pd.concat(pieces, axis=0)
    combined.index.name = y.index.name
    combined.name = y.name

    print('\n' + m, 'last few fitted & forecast values:\n')
    print(combined.tail(p + 3))

    pl.title(y.name + ' (in red) & ' + m + ' fitted values (in blue)')
    pl.plot(y, color='red')
    pl.plot(combined)
    pl.show()
# Fit and report three exponential-smoothing variants, simplest first.
# Each entry is (heading, extra model options); the additive forms are used
# because the series has rows with 0 values.
smoothing_specs = [
    ('Simple Exponential', {}),
    ("Holt's", {'trend': 'add'}),
    ("Winters'", {'trend': 'add', 'seasonal': 'add'}),
]
for label, extra_opts in smoothing_specs:
    o = ExponentialSmoothing(y, freq=y.index.freqstr, seasonal_periods=p, **extra_opts).fit()
    snf(label, o, p, y)
Original dataset:
date US_Covid_Cases
0 2020-01-21 1.0
1 2020-01-22 1.0
2 2020-01-23 1.0
3 2020-01-24 2.0
4 2020-01-25 3.0
.. ... ...
434 2021-03-30 30416970.0
435 2021-03-31 30485232.0
436 2021-04-01 30562856.0
437 2021-04-02 30631700.0
438 2021-04-03 30631700.0
[439 rows x 2 columns]
Inferred time series:
date
2020-01-21 1.0
2020-01-22 1.0
2020-01-23 1.0
2020-01-24 2.0
2020-01-25 3.0
...
2021-03-30 30416970.0
2021-03-31 30485232.0
2021-04-01 30562856.0
2021-04-02 30631700.0
2021-04-03 30631700.0
Freq: D, Name: US_Covid_Cases, Length: 439, dtype: float64
Simple Exponential Forecast:
ExponentialSmoothing Model Results
================================================================================
Dep. Variable: US_Covid_Cases No. Observations: 439
Model: ExponentialSmoothing SSE 4185585773096.544
Optimized: True AIC 10091.416
Trend: None BIC 10099.585
Seasonal: None AICC 10091.508
Seasonal Periods: None Date: Thu, 22 Apr 2021
Box-Cox: False Time: 17:02:26
Box-Cox Coeff.: None
==============================================================================
coeff code optimized
------------------------------------------------------------------------------
smoothing_level 0.9950000 alpha True
initial_level 1.0000000 l.0 True
------------------------------------------------------------------------------
Simple Exponential last few fitted & forecast values:
date
2021-04-01 3.048489e+07
2021-04-02 3.056247e+07
2021-04-03 3.063135e+07
2021-04-04 3.063170e+07
2021-04-05 3.063170e+07
2021-04-06 3.063170e+07
2021-04-07 3.063170e+07
2021-04-08 3.063170e+07
2021-04-09 3.063170e+07
2021-04-10 3.063170e+07
2021-04-11 3.063170e+07
2021-04-12 3.063170e+07
2021-04-13 3.063170e+07
2021-04-14 3.063170e+07
2021-04-15 3.063170e+07
2021-04-16 3.063170e+07
2021-04-17 3.063170e+07
Freq: D, Name: US_Covid_Cases, dtype: float64
Holt's Forecast:
ExponentialSmoothing Model Results
================================================================================
Dep. Variable: US_Covid_Cases No. Observations: 439
Model: ExponentialSmoothing SSE 122089594427.002
Optimized: True AIC 8543.706
Trend: Additive BIC 8560.044
Seasonal: None AICC 8543.900
Seasonal Periods: None Date: Thu, 22 Apr 2021
Box-Cox: False Time: 17:02:27
Box-Cox Coeff.: None
==============================================================================
coeff code optimized
------------------------------------------------------------------------------
smoothing_level 0.9950000 alpha True
smoothing_trend 0.4027381 beta True
initial_level 1.0000000 l.0 True
initial_trend 0.000000 b.0 True
------------------------------------------------------------------------------
Holt's last few fitted & forecast values:
date
2021-04-01 3.055003e+07
2021-04-02 3.063276e+07
2021-04-03 3.070125e+07
2021-04-04 3.067372e+07
2021-04-05 3.071539e+07
2021-04-06 3.075707e+07
2021-04-07 3.079874e+07
2021-04-08 3.084042e+07
2021-04-09 3.088209e+07
2021-04-10 3.092376e+07
2021-04-11 3.096544e+07
2021-04-12 3.100711e+07
2021-04-13 3.104878e+07
2021-04-14 3.109046e+07
2021-04-15 3.113213e+07
2021-04-16 3.117380e+07
2021-04-17 3.121548e+07
Freq: D, Name: US_Covid_Cases, dtype: float64
Winters' Forecast:
ExponentialSmoothing Model Results
================================================================================
Dep. Variable: US_Covid_Cases No. Observations: 439
Model: ExponentialSmoothing SSE 110718824753.833
Optimized: True AIC 8528.789
Trend: Additive BIC 8602.310
Seasonal: Additive AICC 8530.798
Seasonal Periods: 14 Date: Thu, 22 Apr 2021
Box-Cox: False Time: 17:02:27
Box-Cox Coeff.: None
=================================================================================
coeff code optimized
---------------------------------------------------------------------------------
smoothing_level 0.8182143 alpha True
smoothing_trend 0.4720467 beta True
smoothing_seasonal 0.1211905 gamma True
initial_level 9.6298e+06 l.0 True
initial_trend 0.6377551 b.0 True
initial_seasons.0 -9.6298e+06 s.0 True
initial_seasons.1 -9.6298e+06 s.1 True
initial_seasons.2 -9.6298e+06 s.2 True
initial_seasons.3 -9.6298e+06 s.3 True
initial_seasons.4 -9.6298e+06 s.4 True
initial_seasons.5 -9.6298e+06 s.5 True
initial_seasons.6 -9.6298e+06 s.6 True
initial_seasons.7 -9.6298e+06 s.7 True
initial_seasons.8 -9.6298e+06 s.8 True
initial_seasons.9 -9.6298e+06 s.9 True
initial_seasons.10 -9.6298e+06 s.10 True
initial_seasons.11 -9.6298e+06 s.11 True
initial_seasons.12 -9.6298e+06 s.12 True
initial_seasons.13 -9.6298e+06 s.13 True
---------------------------------------------------------------------------------
Winters' last few fitted & forecast values:
date
2021-04-01 3.056254e+07
2021-04-02 3.063686e+07
2021-04-03 3.069161e+07
2021-04-04 3.065414e+07
2021-04-05 3.069209e+07
2021-04-06 3.073069e+07
2021-04-07 3.078259e+07
2021-04-08 3.083467e+07
2021-04-09 3.087990e+07
2021-04-10 3.090896e+07
2021-04-11 3.091762e+07
2021-04-12 3.094523e+07
2021-04-13 3.098235e+07
2021-04-14 3.103386e+07
2021-04-15 3.108508e+07
2021-04-16 3.113335e+07
2021-04-17 3.116980e+07
Freq: D, Name: US_Covid_Cases, dtype: float64
#Time Series ARIMA(p,d,q) Forecast
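#p = number of missing values (NaN) at the end of the series, else 1; also used as the AR order
#delete any missing values at the start,
# leave any missing values at the end,
# interpolate in between,
# add more missing values at the end to hold the forecasts
#data file: the "date" column becomes the index and one case-count column is used as y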
#begin of standard header
import os
import IPython
import sys
import numpy as np
import pandas as pd
import scipy
import statsmodels as sm
import matplotlib.pyplot as pl
%matplotlib inline
#for debugging compatibility issues:
a = ! jupyter-notebook --version
v = ! conda --version
print('\nVersions:', str(v)[str(v).find('conda'):-2] + ' jupyter-notebook', str(a)[2:-2]
+ ' IPython', ".".join(map(str, IPython.version_info[:3])) + ' Python', ".".join(map(str, sys.version_info[:3]))
+ ' numpy', np.__version__ + ' pandas', pd.__version__ + ' scipy', scipy.__version__)
try:
print(' ' * 9, 'statsmodels', sm.__version__)
except:
#for Azure server (2019):
print(' ' * 9, '...with older version of statsmodels')
del a, v
#!conda update --y --all #change #! to ! will update any of your older versions
from IPython.core.display import display, HTML
display(HTML("<style>div.output_scroll { height: 31em; }</style>")) #set height of output window below
pl.rcParams['figure.figsize'] = 18, 8 #width & height for time-series plot
#pl.rcParams['figure.figsize'] = 6.000, 6.143 #best for square plot np.arange(1,10) on screens of diff resolutions?
pl.rcParams['lines.linewidth'] = 1
pl.rcParams['lines.markersize'] = 1
import warnings
warnings.simplefilter('ignore')
warnings.filterwarnings('ignore')
if os.name == 'nt':
#Windows:
try:
os.chdir(os.environ['USERPROFILE'] + '\Documents' + r"\0_Teach\data")
except:
#assume file is in current folder
pass
else:
#Mac:
try:
os.chdir('/Users/' + os.environ['USER'] + '/Documents' + r"/0_Teach/data")
except:
#assume file is in current folder
pass
#end of standard header
#Load the UK cumulative case counts as a date-indexed Series named y.
#On failure, show the error and the current folder's contents, then stop the cell.
try:
    #first CSV column is read as the row index; "date" and the counts are ordinary columns
    # y = pd.read_csv('daily-total-female-births.csv', index_col=0) #has 7 NaN(s) in the tail, for filling in forecasts
    y = pd.read_csv("cleaned_data.csv", index_col=0)
    y["date"] = pd.to_datetime(y["date"])
    y.index = y["date"]
    y = y["cumCasesByPublishDate"]
    # y = pd.read_csv('UKDeaths_1970-2018.csv', index_col=0) #has 7 NaN(s) in the tail, for filling in forecasts
except (OSError, KeyError, ValueError) as err:
    #OSError: file missing/unreadable; KeyError: column missing; ValueError: unparsable CSV or dates
    print('Could not load the series:', repr(err))
    print('"'+os.getcwd()+'" is the current folder in the', 'Windows' if os.name == 'nt' else os.name,
          'operating system:\n')
    print('\n'.join(os.listdir()), '\n') #see what's in current folder
    raise SystemExit
#Prepare the series for ARIMA: DatetimeIndex, interior gaps interpolated,
#14 empty rows appended for the forecasts, leading missing rows removed.
#Leaves y (1-column DataFrame), r (frequency), a, b and start_idx for the cells below.
y.index = pd.to_datetime(y.index) #change index to DatetimeIndex; y is now a time series
r = pd.infer_freq(y.index) #infer frequency from index
y = pd.DataFrame(y).iloc[:, :1] #make sure y is a DataFrame of 1 column, instead of possibly a Series
y = y.interpolate(limit_area='inside') #interpolate, but not extrapolate for either end
#add a few more missing rows at the end, for further predictions:
a = [np.nan] * 14
pad = pd.DataFrame(a, index=pd.date_range(y.index[-1], periods=len(a)+1, freq=r)[1:],
                   columns=y.columns.values)
y = pd.concat([y, pad]) #DataFrame.append is deprecated; concat builds the same frame
# y.iloc[:2, :] = np.nan #artificially create missing at the start
is_observed = y.iloc[:, 0].notna().to_numpy()
b = int(is_observed.argmax()) + 1 #1-based position of first non-missing at the start
n_missing = int((~is_observed).sum()) #missing at the start plus missing at the end
print(y.head(max(b, 3))) #show all of any missing at the start
print(y.tail(max(n_missing - b + 2, 3))) #shows all of any missing at the end
y = y.iloc[b - 1:, :] #delete any missing at the start
#position of the last observed value, counted AFTER the trim above so that it
#matches the rows handed to the model (only trailing NaN remain at this point)
start_idx = len(y) - int(y.iloc[:, 0].isna().sum()) - 1
#Fit ARIMA(p,d,q) on the observed part of y, then predict from the last
#observed row through 14 further steps.
from statsmodels.tsa.arima_model import ARIMA
#p=number of missing (NaN); also used as order of autoregression (can be set independently):
missing_per_column = len(y) - y.count()
if isinstance(missing_per_column, pd.Series):
    p = missing_per_column[0]
else:
    p = 1 #set to 1 if no missing at end
d = 0 #times to difference y: order of y's integration; max is 2
q = 1 #order of moving average
order = (p, d, q)
print(f'\nFitting (p,d,q) = ({p},{d},{q})'
      ' ... [p = any number of missing at the end, else 1]\n')
observed_part = y[:-p] #everything except the trailing rows reserved for forecasts
model = ARIMA(observed_part, order, freq=r)
m = model.fit(transparams=True) #fits
print(m.summary())
print('\nbic =', m.bic) #smaller is better
#predict from the last observed row (start_idx) through 14 further steps
end_idx = start_idx + 14
f = m.predict(start=start_idx, end=end_idx)
#ARIMA always gives f a DatetimeIndex, even if y hasn't a DatetimeIndex
print('\nFits &', p, 'Forecasts:\n')
print(f.head(3))
print(f.tail(p + 1))
#Plot the observed series (red) with the ARIMA predictions on top; when many
#predictions exist, repeat the plot zoomed to the last 375 points.
plot_title = ('Observed Series (in red) & ARIMA(' + str(p) + ',' + str(d) + ',' + str(q)
              + ') Fit & Forecast')
pl.plot(y, color='red')
pl.title(plot_title)
pl.plot(f)
pl.show()
if len(f) > 400:
    recent = slice(-375, None)
    pl.plot(y.iloc[recent, :], color='red')
    pl.title(plot_title)
    pl.plot(f[recent])
    pl.show()
Versions: conda 4.9.2 jupyter-notebook 6.1.6 IPython 7.19.0 Python 3.7.9 numpy 1.19.2 pandas 1.2.0 scipy 1.5.2
statsmodels 0.12.1
cumCasesByPublishDate
2020-01-21 0.0
2020-01-22 0.0
2020-01-23 0.0
cumCasesByPublishDate
2021-04-03 4357091.0
2021-04-04 NaN
2021-04-05 NaN
2021-04-06 NaN
2021-04-07 NaN
2021-04-08 NaN
2021-04-09 NaN
2021-04-10 NaN
2021-04-11 NaN
2021-04-12 NaN
2021-04-13 NaN
2021-04-14 NaN
2021-04-15 NaN
2021-04-16 NaN
2021-04-17 NaN
Fitting (p,d,q) = (14,0,1) ... [p = any number of missing at the end, else 1]
ARMA Model Results
=================================================================================
Dep. Variable: cumCasesByPublishDate No. Observations: 439
Model: ARMA(14, 1) Log Likelihood -4129.411
Method: css-mle S.D. of innovations nan
Date: Thu, 22 Apr 2021 AIC 8292.822
Time: 13:51:24 BIC 8362.258
Sample: 01-21-2020 HQIC 8320.217
- 04-03-2021
================================================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------------------------
const 1.216e+06 1.54e+06 0.790 0.429 -1.8e+06 4.23e+06
ar.L1.cumCasesByPublishDate 1.8262 nan nan nan nan nan
ar.L2.cumCasesByPublishDate -0.7719 nan nan nan nan nan
ar.L3.cumCasesByPublishDate -0.0391 nan nan nan nan nan
ar.L4.cumCasesByPublishDate 0.0188 nan nan nan nan nan
ar.L5.cumCasesByPublishDate 0.0759 nan nan nan nan nan
ar.L6.cumCasesByPublishDate 0.0101 nan nan nan nan nan
ar.L7.cumCasesByPublishDate -0.0328 nan nan nan nan nan
ar.L8.cumCasesByPublishDate -0.1906 nan nan nan nan nan
ar.L9.cumCasesByPublishDate 0.0460 nan nan nan nan nan
ar.L10.cumCasesByPublishDate -0.0093 nan nan nan nan nan
ar.L11.cumCasesByPublishDate 0.0581 nan nan nan nan nan
ar.L12.cumCasesByPublishDate -0.0793 nan nan nan nan nan
ar.L13.cumCasesByPublishDate 0.1552 nan nan nan nan nan
ar.L14.cumCasesByPublishDate -0.0673 nan nan nan nan nan
ma.L1.cumCasesByPublishDate -0.2747 0.046 -5.992 0.000 -0.365 -0.185
Roots
==============================================================================
Real Imaginary Modulus Frequency
------------------------------------------------------------------------------
AR.1 -1.1569 -0.3427j 1.2066 -0.4542
AR.2 -1.1569 +0.3427j 1.2066 0.4542
AR.3 -0.8887 -0.8846j 1.2539 -0.3754
AR.4 -0.8887 +0.8846j 1.2539 0.3754
AR.5 -0.3207 -1.1705j 1.2136 -0.2926
AR.6 -0.3207 +1.1705j 1.2136 0.2926
AR.7 0.3443 -1.2798j 1.3253 -0.2082
AR.8 0.3443 +1.2798j 1.3253 0.2082
AR.9 0.7079 -0.8544j 1.1096 -0.1399
AR.10 0.7079 +0.8544j 1.1096 0.1399
AR.11 1.0066 -0.0058j 1.0066 -0.0009
AR.12 1.0066 +0.0058j 1.0066 0.0009
AR.13 1.1116 -0.0000j 1.1116 -0.0000
AR.14 1.8084 -0.0000j 1.8084 -0.0000
MA.1 3.6408 +0.0000j 3.6408 0.0000
------------------------------------------------------------------------------
bic = 8362.25811436098
Fits & 14 Forecasts:
2021-04-03 4.356660e+06
2021-04-04 4.360043e+06
2021-04-05 4.362834e+06
Freq: D, dtype: float64
2021-04-03 4.356660e+06
2021-04-04 4.360043e+06
2021-04-05 4.362834e+06
2021-04-06 4.365458e+06
2021-04-07 4.367946e+06
2021-04-08 4.370387e+06
2021-04-09 4.372463e+06
2021-04-10 4.374176e+06
2021-04-11 4.375624e+06
2021-04-12 4.376850e+06
2021-04-13 4.377839e+06
2021-04-14 4.378726e+06
2021-04-15 4.379380e+06
2021-04-16 4.379789e+06
2021-04-17 4.379917e+06
Freq: D, dtype: float64
#Time Series ARIMA(p,d,q) Forecast
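#p = number of missing values (NaN) at the end of the series, else 1; also used as the AR order
#delete any missing values at the start,
# leave any missing values at the end,
# interpolate in between,
# add more missing values at the end to hold the forecasts
#data file: the "date" column becomes the index and one case-count column is used as y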
#begin of standard header
import os
import IPython
import sys
import numpy as np
import pandas as pd
import scipy
import statsmodels as sm
import matplotlib.pyplot as pl
%matplotlib inline
#for debugging compatibility issues:
a = ! jupyter-notebook --version
v = ! conda --version
print('\nVersions:', str(v)[str(v).find('conda'):-2] + ' jupyter-notebook', str(a)[2:-2]
+ ' IPython', ".".join(map(str, IPython.version_info[:3])) + ' Python', ".".join(map(str, sys.version_info[:3]))
+ ' numpy', np.__version__ + ' pandas', pd.__version__ + ' scipy', scipy.__version__)
try:
print(' ' * 9, 'statsmodels', sm.__version__)
except:
#for Azure server (2019):
print(' ' * 9, '...with older version of statsmodels')
del a, v
#!conda update --y --all #change #! to ! will update any of your older versions
from IPython.core.display import display, HTML
display(HTML("<style>div.output_scroll { height: 31em; }</style>")) #set height of output window below
pl.rcParams['figure.figsize'] = 18, 8 #width & height for time-series plot
#pl.rcParams['figure.figsize'] = 6.000, 6.143 #best for square plot np.arange(1,10) on screens of diff resolutions?
pl.rcParams['lines.linewidth'] = 1
pl.rcParams['lines.markersize'] = 1
import warnings
warnings.simplefilter('ignore')
warnings.filterwarnings('ignore')
if os.name == 'nt':
#Windows:
try:
os.chdir(os.environ['USERPROFILE'] + '\Documents' + r"\0_Teach\data")
except:
#assume file is in current folder
pass
else:
#Mac:
try:
os.chdir('/Users/' + os.environ['USER'] + '/Documents' + r"/0_Teach/data")
except:
#assume file is in current folder
pass
#end of standard header
#Load the US cumulative case counts as a date-indexed Series named y.
#On failure, show the error and the current folder's contents, then stop the cell.
try:
    #first CSV column is read as the row index; "date" and the counts are ordinary columns
    # y = pd.read_csv('daily-total-female-births.csv', index_col=0) #has 7 NaN(s) in the tail, for filling in forecasts
    y = pd.read_csv("cleaned_data.csv", index_col=0)
    y["date"] = pd.to_datetime(y["date"])
    y.index = y["date"]
    y = y["US_Covid_Cases"]
    # y = pd.read_csv('UKDeaths_1970-2018.csv', index_col=0) #has 7 NaN(s) in the tail, for filling in forecasts
except (OSError, KeyError, ValueError) as err:
    #OSError: file missing/unreadable; KeyError: column missing; ValueError: unparsable CSV or dates
    print('Could not load the series:', repr(err))
    print('"'+os.getcwd()+'" is the current folder in the', 'Windows' if os.name == 'nt' else os.name,
          'operating system:\n')
    print('\n'.join(os.listdir()), '\n') #see what's in current folder
    raise SystemExit
#Prepare the series for ARIMA: DatetimeIndex, interior gaps interpolated,
#14 empty rows appended for the forecasts, leading missing rows removed.
#Leaves y (1-column DataFrame), r (frequency), a, b and start_idx for the cells below.
y.index = pd.to_datetime(y.index) #change index to DatetimeIndex; y is now a time series
r = pd.infer_freq(y.index) #infer frequency from index
y = pd.DataFrame(y).iloc[:, :1] #make sure y is a DataFrame of 1 column, instead of possibly a Series
y = y.interpolate(limit_area='inside') #interpolate, but not extrapolate for either end
#add a few more missing rows at the end, for further predictions:
a = [np.nan] * 14
pad = pd.DataFrame(a, index=pd.date_range(y.index[-1], periods=len(a)+1, freq=r)[1:],
                   columns=y.columns.values)
y = pd.concat([y, pad]) #DataFrame.append is deprecated; concat builds the same frame
# y.iloc[:2, :] = np.nan #artificially create missing at the start
is_observed = y.iloc[:, 0].notna().to_numpy()
b = int(is_observed.argmax()) + 1 #1-based position of first non-missing at the start
n_missing = int((~is_observed).sum()) #missing at the start plus missing at the end
print(y.head(max(b, 3))) #show all of any missing at the start
print(y.tail(max(n_missing - b + 2, 3))) #shows all of any missing at the end
y = y.iloc[b - 1:, :] #delete any missing at the start
#position of the last observed value, counted AFTER the trim above so that it
#matches the rows handed to the model (only trailing NaN remain at this point)
start_idx = len(y) - int(y.iloc[:, 0].isna().sum()) - 1
#Fit ARIMA(p,d,q) on the observed part of y, then predict from the last
#observed row through 14 further steps.
from statsmodels.tsa.arima_model import ARIMA
#p=number of missing (NaN); also used as order of autoregression (can be set independently):
missing_per_column = len(y) - y.count()
if isinstance(missing_per_column, pd.Series):
    p = missing_per_column[0]
else:
    p = 1 #set to 1 if no missing at end
d = 1 #times to difference y: order of y's integration; max is 2
q = 1 #order of moving average
order = (p, d, q)
print(f'\nFitting (p,d,q) = ({p},{d},{q})'
      ' ... [p = any number of missing at the end, else 1]\n')
observed_part = y[:-p] #everything except the trailing rows reserved for forecasts
model = ARIMA(observed_part, order, freq=r)
m = model.fit(transparams=True) #fits
print(m.summary())
print('\nbic =', m.bic) #smaller is better
#predict from the last observed row (start_idx) through 14 further steps
end_idx = start_idx + 14
f = m.predict(start=start_idx, end=end_idx)
#ARIMA always gives f a DatetimeIndex, even if y hasn't a DatetimeIndex
print('\nFits &', p, 'Forecasts:\n')
print(f.head(3))
print(f.tail(p + 1))
#Plot the observed series (red) with the ARIMA predictions on top; when many
#predictions exist, repeat the plot zoomed to the last 375 points.
#With d > 0 the default predict() output in f is on the differenced scale
#(day-to-day changes), which cannot be drawn against the observed levels.
#Ask the model for level predictions for the plot only; f itself is left
#untouched for any later cell that uses it.
#NOTE(review): relies on the legacy statsmodels.tsa.arima_model results object,
#whose predict() accepts typ='levels' -- confirm if the ARIMA import changes.
f_levels = m.predict(start=start_idx, end=end_idx, typ='levels') if d > 0 else f
plot_title = ('Observed Series (in red) & ARIMA(' + str(p) + ',' + str(d) + ',' + str(q)
              + ') Fit & Forecast')
pl.plot(y, color='red')
pl.title(plot_title)
pl.plot(f_levels)
pl.show()
if len(f) > 400:
    pl.plot(y.iloc[-375:, :], color='red')
    pl.title(plot_title)
    pl.plot(f_levels[-375:])
    pl.show()
print(len(f))
Versions: conda 4.9.2 jupyter-notebook 6.1.6 IPython 7.19.0 Python 3.7.9 numpy 1.19.2 pandas 1.2.0 scipy 1.5.2
statsmodels 0.12.1
US_Covid_Cases
2020-01-21 1.0
2020-01-22 1.0
2020-01-23 1.0
US_Covid_Cases
2021-04-03 30631700.0
2021-04-04 NaN
2021-04-05 NaN
2021-04-06 NaN
2021-04-07 NaN
2021-04-08 NaN
2021-04-09 NaN
2021-04-10 NaN
2021-04-11 NaN
2021-04-12 NaN
2021-04-13 NaN
2021-04-14 NaN
2021-04-15 NaN
2021-04-16 NaN
2021-04-17 NaN
Fitting (p,d,q) = (14,1,1) ... [p = any number of missing at the end, else 1]
ARIMA Model Results
==============================================================================
Dep. Variable: D.US_Covid_Cases No. Observations: 438
Model: ARIMA(14, 1, 1) Log Likelihood -4814.715
Method: css-mle S.D. of innovations 14285.204
Date: Thu, 22 Apr 2021 AIC 9663.429
Time: 13:51:49 BIC 9732.827
Sample: 01-22-2020 HQIC 9690.812
- 04-03-2021
===========================================================================================
coef std err z P>|z| [0.025 0.975]
-------------------------------------------------------------------------------------------
const 6.994e+04 4.11e+04 1.700 0.089 -1.07e+04 1.51e+05
ar.L1.D.US_Covid_Cases 0.8393 0.222 3.786 0.000 0.405 1.274
ar.L2.D.US_Covid_Cases 0.1367 0.097 1.415 0.157 -0.053 0.326
ar.L3.D.US_Covid_Cases -0.0203 0.093 -0.219 0.827 -0.202 0.162
ar.L4.D.US_Covid_Cases 0.0669 0.070 0.955 0.340 -0.070 0.204
ar.L5.D.US_Covid_Cases 0.0158 0.070 0.226 0.821 -0.121 0.153
ar.L6.D.US_Covid_Cases 0.0609 0.065 0.936 0.349 -0.067 0.189
ar.L7.D.US_Covid_Cases 0.3631 0.062 5.859 0.000 0.242 0.485
ar.L8.D.US_Covid_Cases -0.3937 0.110 -3.568 0.000 -0.610 -0.177
ar.L9.D.US_Covid_Cases -0.1465 0.073 -1.998 0.046 -0.290 -0.003
ar.L10.D.US_Covid_Cases -3.482e-05 0.081 -0.000 1.000 -0.159 0.159
ar.L11.D.US_Covid_Cases -0.0659 0.068 -0.970 0.332 -0.199 0.067
ar.L12.D.US_Covid_Cases 0.0456 0.069 0.660 0.509 -0.090 0.181
ar.L13.D.US_Covid_Cases 0.0872 0.063 1.381 0.167 -0.037 0.211
ar.L14.D.US_Covid_Cases 0.0041 0.059 0.070 0.944 -0.111 0.119
ma.L1.D.US_Covid_Cases -0.5100 0.216 -2.356 0.018 -0.934 -0.086
Roots
==============================================================================
Real Imaginary Modulus Frequency
------------------------------------------------------------------------------
AR.1 1.0082 -0.0000j 1.0082 -0.0000
AR.2 1.1363 -0.1960j 1.1531 -0.0272
AR.3 1.1363 +0.1960j 1.1531 0.0272
AR.4 0.6351 -0.8276j 1.0432 -0.1458
AR.5 0.6351 +0.8276j 1.0432 0.1458
AR.6 -0.2920 -1.0724j 1.1114 -0.2923
AR.7 -0.2920 +1.0724j 1.1114 0.2923
AR.8 0.1910 -1.3964j 1.4094 -0.2284
AR.9 0.1910 +1.3964j 1.4094 0.2284
AR.10 -1.0607 -0.5517j 1.1956 -0.4237
AR.11 -1.0607 +0.5517j 1.1956 0.4237
AR.12 -1.4016 -0.5832j 1.5181 -0.4372
AR.13 -1.4016 +0.5832j 1.5181 0.4372
AR.14 -20.7153 -0.0000j 20.7153 -0.5000
MA.1 1.9609 +0.0000j 1.9609 0.0000
------------------------------------------------------------------------------
bic = 9732.827160791232
Fits & 14 Forecasts:
2021-04-03 60322.994071
2021-04-04 32661.972845
2021-04-05 39927.939057
Freq: D, dtype: float64
2021-04-03 60322.994071
2021-04-04 32661.972845
2021-04-05 39927.939057
2021-04-06 41964.454241
2021-04-07 42096.480841
2021-04-08 45938.452100
2021-04-09 34530.873876
2021-04-10 5137.711480
2021-04-11 20953.332815
2021-04-12 29696.882049
2021-04-13 33707.486424
2021-04-14 39195.505264
2021-04-15 39092.247978
2021-04-16 28017.217564
2021-04-17 15985.527981
Freq: D, dtype: float64
15
# The model was fitted with one order of differencing, so f holds predicted
# day-to-day CHANGES in US cases, not totals. Rebuild the totals by adding the
# running sum of the forecast changes to the last observed total.
#
# f is matched to y by DATE: f starts on the last observed day (its first value
# is the in-sample fitted change for that day), so pairing the values by
# position would shift every forecast by one day and drop the final one.
col = "US_Covid_Cases"
horizon = 14                                   # number of empty forecast rows at the end of y
tail_dates = y.index[-horizon:]                # the dates to fill
start_val = y.iloc[-(horizon + 1)][col]        # last observed cumulative total
future_changes = f.reindex(tail_dates)         # forecast changes for exactly those dates
# one label-based assignment instead of chained y.iloc[k][col] = ..., which
# only works when pandas happens to hand back a view of the row
y.loc[tail_dates, col] = start_val + future_changes.cumsum()
print(y.tail(15))
US_Covid_Cases 2021-04-03 3.063170e+07 2021-04-04 3.069202e+07 2021-04-05 3.072468e+07 2021-04-06 3.076461e+07 2021-04-07 3.080658e+07 2021-04-08 3.084867e+07 2021-04-09 3.089461e+07 2021-04-10 3.092914e+07 2021-04-11 3.093428e+07 2021-04-12 3.095523e+07 2021-04-13 3.098493e+07 2021-04-14 3.101864e+07 2021-04-15 3.105783e+07 2021-04-16 3.109693e+07 2021-04-17 3.112494e+07